[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

Re: [Cluster-devel] [GFS2 Patch] [Try 5] GFS2: Reduce file fragmentation



Hi,

With a slight tweak to take account of Abhi's patch, this is now in the
-nmw tree. A really good milestone to have reached. We should now take
the time to give this a really good test, and make sure that there are
no other corner cases lurking anywhere.

Steve.

On Wed, 2012-07-18 at 12:26 -0400, Bob Peterson wrote:
> Hi,
> 
> This is my fifth attempt at posting my file defragmentation patch
> for GFS2. This patch incorporates many of the things suggested by
> Steve Whitehouse in past emails. Notes:
> 
> 1. The reservations now keep track of starting point and free space,
>    and the starting point advances as blocks are claimed. This eliminates
>    a couple variables from the reservations structure and makes it
>    shorter, and the code smaller.
> 2. I'm using a spin_lock now to protect the reservation tree rather
>    than the mutex.
> 3. I'm not trying to implement an Orlov allocator for the directories
>    for this iteration. I'm now keeping the inode blocks closer together
>    by having the inode create function grab an allocation on behalf of
>    the new inode, using the parent directory.
> 
> Description:
> 
> This patch reduces GFS2 file fragmentation by pre-reserving blocks.
> 
> Regards,
> 
> Bob Peterson
> Red Hat File Systems
> 
> Signed-off-by: Bob Peterson <rpeterso redhat com> 
> ---
>  fs/gfs2/bmap.c       |    3 +
>  fs/gfs2/file.c       |   24 +--
>  fs/gfs2/incore.h     |   49 ++++-
>  fs/gfs2/inode.c      |   37 +++-
>  fs/gfs2/rgrp.c       |  576 +++++++++++++++++++++++++++++++++++++++++++++-----
>  fs/gfs2/rgrp.h       |   31 +++-
>  fs/gfs2/super.c      |    7 +
>  fs/gfs2/trace_gfs2.h |   59 +++++-
>  fs/gfs2/xattr.c      |   12 +
>  9 files changed, 707 insertions(+), 91 deletions(-)
> 
> diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
> index 6d957a8..49cd7dd 100644
> --- a/fs/gfs2/bmap.c
> +++ b/fs/gfs2/bmap.c
> @@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
>  	if (error)
>  		goto out_rlist;
>  
> +	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
> +		gfs2_rs_deltree(ip->i_res);
> +
>  	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
>  				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
>  				 revokes);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 6fbf3cb..9f94832 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint,
> +		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
> +
>  	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
>  	ret = gfs2_glock_nq(&gh);
>  	if (ret)
> @@ -571,22 +574,15 @@ fail:
>  
>  static int gfs2_release(struct inode *inode, struct file *file)
>  {
> -	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
> -	struct gfs2_file *fp;
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  
> -	fp = file->private_data;
> +	kfree(file->private_data);
>  	file->private_data = NULL;
>  
> -	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
> +	if ((file->f_mode & FMODE_WRITE) &&
>  	    (atomic_read(&inode->i_writecount) == 1))
>  		gfs2_rs_delete(ip);
>  
> -	if (gfs2_assert_warn(sdp, fp))
> -		return -EIO;
> -
> -	kfree(fp);
> -
>  	return 0;
>  }
>  
> @@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  				   unsigned long nr_segs, loff_t pos)
>  {
>  	struct file *file = iocb->ki_filp;
> +	size_t writesize = iov_length(iov, nr_segs);
>  	struct dentry *dentry = file->f_dentry;
>  	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
> +	struct gfs2_sbd *sdp;
>  	int ret;
>  
> +	sdp = GFS2_SB(file->f_mapping->host);
>  	ret = gfs2_rs_alloc(ip);
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
>  	if (file->f_flags & O_APPEND) {
>  		struct gfs2_holder gh;
>  
> @@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  	if (unlikely(error))
>  		goto out_uninit;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
> +
>  	while (len > 0) {
>  		if (len < bytes)
>  			bytes = len;
> @@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  			offset += bytes;
>  			continue;
>  		}
> -		error = gfs2_rindex_update(sdp);
> -		if (error)
> -			goto out_unlock;
> -
>  		error = gfs2_quota_lock_check(ip);
>  		if (error)
>  			goto out_unlock;
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index dc73070..aaecc80 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -84,6 +84,7 @@ struct gfs2_rgrpd {
>  	u32 rd_data;			/* num of data blocks in rgrp */
>  	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
>  	u32 rd_free;
> +	u32 rd_reserved;                /* number of blocks reserved */
>  	u32 rd_free_clone;
>  	u32 rd_dinodes;
>  	u64 rd_igeneration;
> @@ -96,6 +97,9 @@ struct gfs2_rgrpd {
>  #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
>  #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
>  #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
> +	spinlock_t rd_rsspin;           /* protects reservation related vars */
> +	struct rb_root rd_rstree;       /* multi-block reservation tree */
> +	u32 rd_rs_cnt;                  /* count of current reservations */
>  };
>  
>  enum gfs2_state_bits {
> @@ -233,6 +237,38 @@ struct gfs2_holder {
>  	unsigned long gh_ip;
>  };
>  
> +/* Resource group multi-block reservation, in order of appearance:
> +
> +   Step 1. Function prepares to write, allocates a mb, sets the size hint.
> +   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
> +   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
> +   Step 4. Bits are assigned from the rgrp based on either the reservation
> +           or wherever it can.
> +*/
> +
> +struct gfs2_blkreserv {
> +	/* components used during write (step 1): */
> +	atomic_t rs_sizehint;         /* hint of the write size */
> +
> +	/* components used during inplace_reserve (step 2): */
> +	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> +
> +	/* components used during get_local_rgrp (step 3): */
> +	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
> +	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
> +	struct rb_node rs_node;       /* link to other block reservations */
> +
> +	/* components used during block searches and assignments (step 4): */
> +	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
> +	u32 rs_biblk;                 /* start block relative to the bi */
> +	u32 rs_free;                  /* how many blocks are still free */
> +
> +	/* ancillary quota stuff */
> +	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> +	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> +	unsigned int rs_qa_qd_num;
> +};
> +
>  enum {
>  	GLF_LOCK			= 1,
>  	GLF_DEMOTE			= 3,
> @@ -290,16 +326,6 @@ struct gfs2_glock {
>  
>  #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
>  
> -struct gfs2_blkreserv {
> -	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> -	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
> -
> -	/* ancillary quota stuff */
> -	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> -	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> -	unsigned int rs_qa_qd_num;
> -};
> -
>  enum {
>  	GIF_INVALID		= 0,
>  	GIF_QD_LOCKED		= 1,
> @@ -307,7 +333,6 @@ enum {
>  	GIF_SW_PAGED		= 3,
>  };
>  
> -
>  struct gfs2_inode {
>  	struct inode i_inode;
>  	u64 i_no_addr;
> @@ -318,7 +343,7 @@ struct gfs2_inode {
>  	struct gfs2_glock *i_gl; /* Move into i_gh? */
>  	struct gfs2_holder i_iopen_gh;
>  	struct gfs2_holder i_gh; /* for prepare/commit_write only */
> -	struct gfs2_blkreserv *i_res; /* resource group block reservation */
> +	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
>  	struct gfs2_rgrpd *i_rgd;
>  	u64 i_goal;	/* goal block for allocations */
>  	struct rw_semaphore i_rw_mutex;
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 2b035e0..c53c67e 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
>  	int error;
>  
>  	munge_mode_uid_gid(dip, &mode, &uid, &gid);
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
>  
>  	error = gfs2_quota_lock(dip, uid, gid);
>  	if (error)
> @@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto fail;
> @@ -596,7 +603,8 @@ fail_end_trans:
>  	gfs2_trans_end(sdp);
>  
>  fail_ipreserv:
> -	gfs2_inplace_release(dip);
> +	if (alloc_required)
> +		gfs2_inplace_release(dip);
>  
>  fail_quota_locks:
>  	gfs2_quota_unlock(dip);
> @@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	const struct qstr *name = &dentry->d_name;
>  	struct gfs2_holder ghs[2];
>  	struct inode *inode = NULL;
> -	struct gfs2_inode *dip = GFS2_I(dir);
> +	struct gfs2_inode *dip = GFS2_I(dir), *ip;
>  	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
>  	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
>  	int error;
> @@ -657,6 +665,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	if (!name->len || name->len > GFS2_FNAMESIZE)
>  		return -ENAMETOOLONG;
>  
> +	/* We need a reservation to allocate the new dinode block. The
> +	   directory ip temporarily points to the reservation, but this is
> +	   being done to get a set of contiguous blocks for the new dinode.
> +	   Since this is a create, we don't have a sizehint yet, so it will
> +	   have to use the minimum reservation size. */
>  	error = gfs2_rs_alloc(dip);
>  	if (error)
>  		return error;
> @@ -694,24 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>  	if (IS_ERR(inode))
>  		goto fail_gunlock2;
>  
> -	error = gfs2_inode_refresh(GFS2_I(inode));
> +	ip = GFS2_I(inode);
> +	error = gfs2_inode_refresh(ip);
>  	if (error)
>  		goto fail_gunlock2;
>  
> -	/* the new inode needs a reservation so it can allocate xattrs. */
> -	error = gfs2_rs_alloc(GFS2_I(inode));
> -	if (error)
> -		goto fail_gunlock2;
> +	/* The newly created inode needs a reservation so it can allocate
> +	   xattrs. At the same time, we want new blocks allocated to the new
> +	   dinode to be as contiguous as possible. Since we allocated the
> +	   dinode block under the directory's reservation, we transfer
> +	   ownership of that reservation to the new inode. The directory
> +	   doesn't need a reservation unless it needs a new allocation. */
> +	ip->i_res = dip->i_res;
> +	dip->i_res = NULL;
>  
>  	error = gfs2_acl_create(dip, inode);
>  	if (error)
>  		goto fail_gunlock2;
>  
> -	error = gfs2_security_init(dip, GFS2_I(inode), name);
> +	error = gfs2_security_init(dip, ip, name);
>  	if (error)
>  		goto fail_gunlock2;
>  
> -	error = link_dinode(dip, name, GFS2_I(inode));
> +	error = link_dinode(dip, name, ip);
>  	if (error)
>  		goto fail_gunlock2;
>  
> @@ -738,6 +756,7 @@ fail_gunlock:
>  		iput(inode);
>  	}
>  fail:
> +	gfs2_rs_delete(dip);
>  	if (bh)
>  		brelse(bh);
>  	return error;
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index e53d0a1..ad37fe2 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -35,6 +35,9 @@
>  #define BFITNOENT ((u32)~0)
>  #define NO_BLOCK ((u64)~0)
>  
> +#define RSRV_CONTENTION_FACTOR 4
> +#define RGRP_RSRV_MAX_CONTENDERS 2
> +
>  #if BITS_PER_LONG == 32
>  #define LBITMASK   (0x55555555UL)
>  #define LBITSKIP55 (0x55555555UL)
> @@ -178,6 +181,57 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
>  }
>  
>  /**
> + * rs_cmp - multi-block reservation range compare
> + * @blk: absolute file system block number of the new reservation
> + * @len: number of blocks in the new reservation
> + * @rs: existing reservation to compare against
> + *
> + * returns: 1 if the block range is beyond the reach of the reservation
> + *         -1 if the block range is before the start of the reservation
> + *          0 if the block range overlaps with the reservation
> + */
> +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
> +{
> +	u64 startblk = gfs2_rs_startblk(rs);
> +
> +	if (blk >= startblk + rs->rs_free)
> +		return 1;
> +	if (blk + len - 1 < startblk)
> +		return -1;
> +	return 0;
> +}
> +
> +/**
> + * rs_find - Find a rgrp multi-block reservation that contains a given block
> + * @rgd: The rgrp
> + * @rgblk: The block we're looking for, relative to the rgrp
> + */
> +static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
> +{
> +	struct rb_node **newn;
> +	int rc;
> +	u64 fsblk = rgblk + rgd->rd_data0;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	newn = &rgd->rd_rstree.rb_node;
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +		rc = rs_cmp(fsblk, 1, cur);
> +		if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else {
> +			spin_unlock(&rgd->rd_rsspin);
> +			return cur;
> +		}
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
> +	return NULL;
> +}
> +
> +/**
>   * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
>   *       a block in a given allocation state.
>   * @buf: the buffer that holds the bitmaps
> @@ -424,19 +478,93 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
>  int gfs2_rs_alloc(struct gfs2_inode *ip)
>  {
>  	int error = 0;
> +	struct gfs2_blkreserv *res;
> +
> +	if (ip->i_res)
> +		return 0;
> +
> +	res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
> +	if (!res)
> +		error = -ENOMEM;
>  
>  	down_write(&ip->i_rw_mutex);
> -	if (!ip->i_res) {
> -		ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
> -		if (!ip->i_res)
> -			error = -ENOMEM;
> -	}
> +	if (ip->i_res)
> +		kmem_cache_free(gfs2_rsrv_cachep, res);
> +	else
> +		ip->i_res = res;
>  	up_write(&ip->i_rw_mutex);
>  	return error;
>  }
>  
> +static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
> +{
> +	gfs2_print_dbg(seq, "  r: %llu s:%llu b:%u f:%u\n",
> +		       rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk,
> +		       rs->rs_free);
> +}
> +
>  /**
> - * gfs2_rs_delete - delete a reservation
> + * __rs_deltree - remove a multi-block reservation from the rgd tree
> + * @rs: The reservation to remove
> + *
> + */
> +static void __rs_deltree(struct gfs2_blkreserv *rs)
> +{
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(rs))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	/* We can't do this: The reason is that when the rgrp is invalidated,
> +	   it's in the "middle" of acquiring the glock, but the HOLDER bit
> +	   isn't set yet:
> +	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
> +	trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
> +
> +	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
> +		rb_erase(&rs->rs_node, &rgd->rd_rstree);
> +	BUG_ON(!rgd->rd_rs_cnt);
> +	rgd->rd_rs_cnt--;
> +
> +	if (rs->rs_free) {
> +		/* return reserved blocks to the rgrp and the ip */
> +		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
> +		rs->rs_rgd->rd_reserved -= rs->rs_free;
> +		rs->rs_free = 0;
> +		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
> +		smp_mb__after_clear_bit();
> +	}
> +	/* We can't change any of the step 1 or step 2 components of the rs.
> +	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
> +	   dequeued through this pointer.
> +	   Can't: atomic_set(&rs->rs_sizehint, 0);
> +	   Can't: rs->rs_requested = 0;
> +	   Can't: rs->rs_rgd = NULL;*/
> +	rs->rs_bi = NULL;
> +	rs->rs_biblk = 0;
> +}
> +
> +/**
> + * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
> + * @rs: The reservation to remove
> + *
> + */
> +void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
> +{
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(rs))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	spin_lock(&rgd->rd_rsspin);
> +	__rs_deltree(rs);
> +	spin_unlock(&rgd->rd_rsspin);
> +}
> +
> +/**
> + * gfs2_rs_delete - delete a multi-block reservation
>   * @ip: The inode for this reservation
>   *
>   */
> @@ -444,12 +572,36 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
>  {
>  	down_write(&ip->i_rw_mutex);
>  	if (ip->i_res) {
> +		gfs2_rs_deltree(ip->i_res);
> +		trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
> +		BUG_ON(ip->i_res->rs_free);
>  		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
>  		ip->i_res = NULL;
>  	}
>  	up_write(&ip->i_rw_mutex);
>  }
>  
> +/**
> + * return_all_reservations - return all reserved blocks back to the rgrp.
> + * @rgd: the rgrp that needs its space back
> + *
> + * We previously reserved a bunch of blocks for allocation. Now we need to
> + * give them back. This leave the reservation structures in tact, but removes
> + * all of their corresponding "no-fly zones".
> + */
> +static void return_all_reservations(struct gfs2_rgrpd *rgd)
> +{
> +	struct rb_node *n;
> +	struct gfs2_blkreserv *rs;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	while ((n = rb_first(&rgd->rd_rstree))) {
> +		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		__rs_deltree(rs);
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
> +}
> +
>  void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  {
>  	struct rb_node *n;
> @@ -472,6 +624,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  
>  		gfs2_free_clones(rgd);
>  		kfree(rgd->rd_bits);
> +		return_all_reservations(rgd);
>  		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
>  	}
>  }
> @@ -649,6 +802,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>  	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
>  	rgd->rd_data = be32_to_cpu(buf.ri_data);
>  	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
> +	spin_lock_init(&rgd->rd_rsspin);
>  
>  	error = compute_bitstructs(rgd);
>  	if (error)
> @@ -1115,29 +1269,212 @@ out:
>  }
>  
>  /**
> + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
> + * @bi: the bitmap with the blocks
> + * @ip: the inode structure
> + * @biblk: the 32-bit block number relative to the start of the bitmap
> + * @amount: the number of blocks to reserve
> + *
> + * Returns: NULL - reservation was already taken, so not inserted
> + *          pointer to the inserted reservation
> + */
> +static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
> +				       struct gfs2_inode *ip, u32 biblk,
> +				       int amount)
> +{
> +	struct rb_node **newn, *parent = NULL;
> +	int rc;
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +
> +	spin_lock(&rgd->rd_rsspin);
> +	newn = &rgd->rd_rstree.rb_node;
> +	BUG_ON(!ip->i_res);
> +	BUG_ON(gfs2_rs_active(rs));
> +	/* Figure out where to put new node */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +
> +		parent = *newn;
> +		rc = rs_cmp(fsblock, amount, cur);
> +		if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else {
> +			spin_unlock(&rgd->rd_rsspin);
> +			return NULL; /* reservation already in use */
> +		}
> +	}
> +
> +	/* Do our reservation work */
> +	rs = ip->i_res;
> +	rs->rs_free = amount;
> +	rs->rs_biblk = biblk;
> +	rs->rs_bi = bi;
> +	rb_link_node(&rs->rs_node, parent, newn);
> +	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
> +
> +	/* Do our inode accounting for the reservation */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +
> +	/* Do our rgrp accounting for the reservation */
> +	rgd->rd_reserved += amount; /* blocks reserved */
> +	rgd->rd_rs_cnt++; /* number of in-tree reservations */
> +	spin_unlock(&rgd->rd_rsspin);
> +	trace_gfs2_rs(ip, rs, TRACE_RS_INSERT);
> +	return rs;
> +}
> +
> +/**
> + * unclaimed_blocks - return number of blocks that aren't spoken for
> + */
> +static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
> +{
> +	return rgd->rd_free_clone - rgd->rd_reserved;
> +}
> +
> +/**
> + * rg_mblk_search - find a group of multiple free blocks
> + * @rgd: the resource group descriptor
> + * @rs: the block reservation
> + * @ip: pointer to the inode for which we're reserving blocks
> + *
> + * This is very similar to rgblk_search, except we're looking for whole
> + * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
> + * on aligned dwords for speed's sake.
> + *
> + * Returns: 0 if successful or BFITNOENT if there isn't enough free space
> + */
> +
> +static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
> +{
> +	struct gfs2_bitmap *bi = rgd->rd_bits;
> +	const u32 length = rgd->rd_length;
> +	u32 blk;
> +	unsigned int buf, x, search_bytes;
> +	u8 *buffer = NULL;
> +	u8 *ptr, *end, *nonzero;
> +	u32 goal, rsv_bytes;
> +	struct gfs2_blkreserv *rs;
> +	u32 best_rs_bytes, unclaimed;
> +	int best_rs_blocks;
> +
> +	/* Find bitmap block that contains bits for goal block */
> +	if (rgrp_contains_block(rgd, ip->i_goal))
> +		goal = ip->i_goal - rgd->rd_data0;
> +	else
> +		goal = rgd->rd_last_alloc;
> +	for (buf = 0; buf < length; buf++) {
> +		bi = rgd->rd_bits + buf;
> +		/* Convert scope of "goal" from rgrp-wide to within
> +		   found bit block */
> +		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
> +			goal -= bi->bi_start * GFS2_NBBY;
> +			goto do_search;
> +		}
> +	}
> +	buf = 0;
> +	goal = 0;
> +
> +do_search:
> +	best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
> +			       (RGRP_RSRV_MINBLKS * rgd->rd_length));
> +	best_rs_bytes = (best_rs_blocks *
> +			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
> +		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
> +	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
> +	unclaimed = unclaimed_blocks(rgd);
> +	if (best_rs_bytes * GFS2_NBBY > unclaimed)
> +		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
> +
> +	for (x = 0; x <= length; x++) {
> +		bi = rgd->rd_bits + buf;
> +
> +		if (test_bit(GBF_FULL, &bi->bi_flags))
> +			goto skip;
> +
> +		WARN_ON(!buffer_uptodate(bi->bi_bh));
> +		if (bi->bi_clone)
> +			buffer = bi->bi_clone + bi->bi_offset;
> +		else
> +			buffer = bi->bi_bh->b_data + bi->bi_offset;
> +
> +		/* We have to keep the reservations aligned on u64 boundaries
> +		   otherwise we could get situations where a byte can't be
> +		   used because it's after a reservation, but a free bit still
> +		   is within the reservation's area. */
> +		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
> +		end = (buffer + bi->bi_len);
> +		while (ptr < end) {
> +			rsv_bytes = 0;
> +			if ((ptr + best_rs_bytes) <= end)
> +				search_bytes = best_rs_bytes;
> +			else
> +				search_bytes = end - ptr;
> +			BUG_ON(!search_bytes);
> +			nonzero = memchr_inv(ptr, 0, search_bytes);
> +			/* If the lot is all zeroes, reserve the whole size. If
> +			   there's enough zeroes to satisfy the request, use
> +			   what we can. If there's not enough, keep looking. */
> +			if (nonzero == NULL)
> +				rsv_bytes = search_bytes;
> +			else if ((nonzero - ptr) * GFS2_NBBY >=
> +				 ip->i_res->rs_requested)
> +				rsv_bytes = (nonzero - ptr);
> +
> +			if (rsv_bytes) {
> +				blk = ((ptr - buffer) * GFS2_NBBY);
> +				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
> +				rs = rs_insert(bi, ip, blk,
> +					       rsv_bytes * GFS2_NBBY);
> +				if (IS_ERR(rs))
> +					return PTR_ERR(rs);
> +				if (rs)
> +					return 0;
> +			}
> +			ptr += ALIGN(search_bytes, sizeof(u64));
> +		}
> +skip:
> +		/* Try next bitmap block (wrap back to rgrp header
> +		   if at end) */
> +		buf++;
> +		buf %= length;
> +		goal = 0;
> +	}
> +
> +	return BFITNOENT;
> +}
> +
> +/**
>   * try_rgrp_fit - See if a given reservation will fit in a given RG
>   * @rgd: the RG data
>   * @ip: the inode
>   *
>   * If there's room for the requested blocks to be allocated from the RG:
> + * This will try to get a multi-block reservation first, and if that doesn't
> + * fit, it will take what it can.
>   *
>   * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
>   */
>  
> -static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
> +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
>  {
> -	const struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_blkreserv *rs = ip->i_res;
>  
>  	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
>  		return 0;
> -	if (rgd->rd_free_clone >= rs->rs_requested)
> +	/* Look for a multi-block reservation. */
> +	if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
> +	    rg_mblk_search(rgd, ip) != BFITNOENT)
> +		return 1;
> +	if (unclaimed_blocks(rgd) >= rs->rs_requested)
>  		return 1;
> -	return 0;
> -}
>  
> -static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> -{
> -	return (bi->bi_start * GFS2_NBBY) + blk;
> +	return 0;
>  }
>  
>  /**
> @@ -1217,7 +1554,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>  int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  {
>  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> -	struct gfs2_rgrpd *rgd, *begin = NULL;
> +	struct gfs2_rgrpd *begin = NULL;
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  	int error = 0, rg_locked, flags = LM_FLAG_TRY;
>  	u64 last_unlinked = NO_BLOCK;
> @@ -1225,32 +1562,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  
>  	if (sdp->sd_args.ar_rgrplvb)
>  		flags |= GL_SKIP;
> -	rs = ip->i_res;
>  	rs->rs_requested = requested;
>  	if (gfs2_assert_warn(sdp, requested)) {
>  		error = -EINVAL;
>  		goto out;
>  	}
> -
> -	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
> -		rgd = begin = ip->i_rgd;
> -	else
> -		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> -
> -	if (rgd == NULL)
> +	if (gfs2_rs_active(rs)) {
> +		begin = rs->rs_rgd;
> +		flags = 0; /* Yoda: Do or do not. There is no try */
> +	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
> +		rs->rs_rgd = begin = ip->i_rgd;
> +	} else {
> +		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> +	}
> +	if (rs->rs_rgd == NULL)
>  		return -EBADSLT;
>  
>  	while (loops < 3) {
>  		rg_locked = 0;
>  
> -		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
> +		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
>  			rg_locked = 1;
>  			error = 0;
> +		} else if (!loops && !gfs2_rs_active(rs) &&
> +			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
> +			/* If the rgrp already is maxed out for contenders,
> +			   we can eliminate it as a "first pass" without even
> +			   requesting the rgrp glock. */
> +			error = GLR_TRYFAILED;
>  		} else {
> -			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
> -						   flags, &rs->rs_rgd_gh);
> +			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
> +						   LM_ST_EXCLUSIVE, flags,
> +						   &rs->rs_rgd_gh);
>  			if (!error && sdp->sd_args.ar_rgrplvb) {
> -				error = update_rgrp_lvb(rgd);
> +				error = update_rgrp_lvb(rs->rs_rgd);
>  				if (error) {
>  					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  					return error;
> @@ -1259,24 +1604,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  		}
>  		switch (error) {
>  		case 0:
> -			if (try_rgrp_fit(rgd, ip)) {
> +			if (gfs2_rs_active(rs)) {
> +				if (unclaimed_blocks(rs->rs_rgd) +
> +				    rs->rs_free >= rs->rs_requested) {
> +					ip->i_rgd = rs->rs_rgd;
> +					return 0;
> +				}
> +				/* We have a multi-block reservation, but the
> +				   rgrp doesn't have enough free blocks to
> +				   satisfy the request. Free the reservation
> +				   and look for a suitable rgrp. */
> +				gfs2_rs_deltree(rs);
> +			}
> +			if (try_rgrp_fit(rs->rs_rgd, ip)) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				ip->i_rgd = rgd;
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				ip->i_rgd = rs->rs_rgd;
>  				return 0;
>  			}
> -			if (rgd->rd_flags & GFS2_RDF_CHECK) {
> +			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				try_rgrp_unlink(rgd, &last_unlinked,
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
>  						ip->i_no_addr);
>  			}
>  			if (!rg_locked)
>  				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  			/* fall through */
>  		case GLR_TRYFAILED:
> -			rgd = gfs2_rgrpd_get_next(rgd);
> -			if (rgd != begin) /* If we didn't wrap */
> +			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
> +			if (rs->rs_rgd != begin) /* If we didn't wrap */
>  				break;
>  
>  			flags &= ~LM_FLAG_TRY;
> @@ -1314,6 +1671,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
>  {
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  
> +	if (!rs)
> +		return;
> +
> +	if (!rs->rs_free)
> +		gfs2_rs_deltree(rs);
> +
>  	if (rs->rs_rgd_gh.gh_gl)
>  		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  	rs->rs_requested = 0;
> @@ -1412,7 +1775,27 @@ do_search:
>  		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
>  			buffer = bi->bi_clone + bi->bi_offset;
>  
> -		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +		while (1) {
> +			struct gfs2_blkreserv *rs;
> +			u32 rgblk;
> +
> +			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +			if (biblk == BFITNOENT)
> +				break;
> +			/* Check if this block is reserved() */
> +			rgblk = gfs2_bi2rgd_blk(bi, biblk);
> +			rs = rs_find(rgd, rgblk);
> +			if (rs == NULL)
> +				break;
> +
> +			BUG_ON(rs->rs_bi != bi);
> +			biblk = BFITNOENT;
> +			/* This should jump to the first block after the
> +			   reservation. */
> +			goal = rs->rs_biblk + rs->rs_free;
> +			if (goal >= bi->bi_len * GFS2_NBBY)
> +				break;
> +		}
>  		if (biblk != BFITNOENT)
>  			break;
>  
> @@ -1448,8 +1831,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  			     u32 blk, bool dinode, unsigned int *n)
>  {
>  	const unsigned int elen = *n;
> -	u32 goal;
> +	u32 goal, rgblk;
>  	const u8 *buffer = NULL;
> +	struct gfs2_blkreserv *rs;
>  
>  	*n = 0;
>  	buffer = bi->bi_bh->b_data + bi->bi_offset;
> @@ -1462,6 +1846,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  		goal++;
>  		if (goal >= (bi->bi_len * GFS2_NBBY))
>  			break;
> +		rgblk = gfs2_bi2rgd_blk(bi, goal);
> +		rs = rs_find(rgd, rgblk);
> +		if (rs) /* Oops, we bumped into someone's reservation */
> +			break;
>  		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
>  		    GFS2_BLKST_FREE)
>  			break;
> @@ -1537,12 +1925,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
>  
>  int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
>  {
> -	const struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_blkreserv *trs;
> +	const struct rb_node *n;
> +
>  	if (rgd == NULL)
>  		return 0;
> -	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
> +	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
>  		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
> -		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
> +		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
> +		       rgd->rd_reserved);
> +	spin_lock(&rgd->rd_rsspin);
> +	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
> +		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		dump_rs(seq, trs);
> +	}
> +	spin_unlock(&rgd->rd_rsspin);
>  	return 0;
>  }
>  
> @@ -1557,10 +1955,63 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
>  }
>  
>  /**
> + * claim_reserved_blks - Claim previously reserved blocks
> + * @ip: the inode that's claiming the reservation
> + * @dinode: 1 if this block is a dinode block, otherwise data block
> + * @nblocks: desired extent length
> + *
> + * Lay claim to blocks from a previously made block reservation.
> + * Returns: Starting block number of the blocks claimed.
> + * Sets *nblocks to the actual extent length allocated.
> + */
> +static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
> +			       unsigned int *nblocks)
> +{
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> +	struct gfs2_bitmap *bi;
> +	u64 start_block = gfs2_rs_startblk(rs);
> +	const unsigned int elen = *nblocks;
> +
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +	gfs2_assert_withdraw(sdp, rgd);
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	bi = rs->rs_bi;
> +	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
> +
> +	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
> +		/* Make sure the bitmap hasn't changed */
> +		gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk,
> +			    dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
> +		rs->rs_biblk++;
> +		rs->rs_free--;
> +
> +		BUG_ON(!rgd->rd_reserved);
> +		rgd->rd_reserved--;
> +		dinode = false;
> +		trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM);
> +	}
> +
> +	if (!rs->rs_free) {
> +		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
> +
> +		gfs2_rs_deltree(rs);
> +		/* -nblocks because we haven't returned to do the math yet.
> +		   I'm doing the math backwards to prevent negative numbers,
> +		   but think of it as:
> +		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS) */
> +		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
> +			rg_mblk_search(rgd, ip);
> +	}
> +	return start_block;
> +}
> +
> +/**
>   * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
>   * @ip: the inode to allocate the block for
>   * @bn: Used to return the starting block number
> - * @ndata: requested number of blocks/extent length (value/result)
> + * @nblocks: requested number of blocks/extent length (value/result)
>   * @dinode: 1 if we're allocating a dinode block, else 0
>   * @generation: the generation number of the inode
>   *
> @@ -1585,20 +2036,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  	if (ip->i_res->rs_requested == 0)
>  		return -ECANCELED;
>  
> -	rgd = ip->i_rgd;
> -
> -	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> -		goal = ip->i_goal - rgd->rd_data0;
> -	else
> -		goal = rgd->rd_last_alloc;
> -
> -	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +	/* Check if we have a multi-block reservation, and if so, claim the
> +	   next free block from it. */
> +	if (gfs2_rs_active(ip->i_res)) {
> +		BUG_ON(!ip->i_res->rs_free);
> +		rgd = ip->i_res->rs_rgd;
> +		block = claim_reserved_blks(ip, dinode, nblocks);
> +	} else {
> +		rgd = ip->i_rgd;
>  
> -	/* Since all blocks are reserved in advance, this shouldn't happen */
> -	if (blk == BFITNOENT)
> -		goto rgrp_error;
> +		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> +			goal = ip->i_goal - rgd->rd_data0;
> +		else
> +			goal = rgd->rd_last_alloc;
> +
> +		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +
> +		/* Since all blocks are reserved in advance, this shouldn't
> +		   happen */
> +		if (blk == BFITNOENT) {
> +			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
> +			       *nblocks);
> +			printk(KERN_WARNING "FULL=%d\n",
> +			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
> +			goto rgrp_error;
> +		}
>  
> -	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +	}
>  	ndata = *nblocks;
>  	if (dinode)
>  		ndata--;
> @@ -1615,8 +2080,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  			brelse(dibh);
>  		}
>  	}
> -	if (rgd->rd_free < *nblocks)
> +	if (rgd->rd_free < *nblocks) {
> +		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
>  		goto rgrp_error;
> +	}
>  
>  	rgd->rd_free -= *nblocks;
>  	if (dinode) {
> @@ -1876,6 +2343,7 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
>  		for (x = 0; x < rlist->rl_rgrps; x++)
>  			gfs2_holder_uninit(&rlist->rl_ghs[x]);
>  		kfree(rlist->rl_ghs);
> +		rlist->rl_ghs = NULL;
>  	}
>  }
>  
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index 5d8314d..ca6e267 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -13,6 +13,14 @@
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  
> +/* Since each block in the file system is represented by two bits in the
> + * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
> + * By reserving 32 blocks at a time, we can optimize / shortcut how we search
> + * through the bitmaps by looking a word at a time.
> + */
> +#define RGRP_RSRV_MINBYTES 8
> +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
> +
>  struct gfs2_rgrpd;
>  struct gfs2_sbd;
>  struct gfs2_holder;
> @@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
>  extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
>  extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
>  
> +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
> +
>  extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
>  extern void gfs2_inplace_release(struct gfs2_inode *ip);
>  
> @@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
>  			     bool dinode, u64 *generation);
>  
>  extern int gfs2_rs_alloc(struct gfs2_inode *ip);
> +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
>  extern void gfs2_rs_delete(struct gfs2_inode *ip);
>  extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
>  extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
> @@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
>  				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
>  extern int gfs2_fitrim(struct file *filp, void __user *argp);
>  
> -/* This is how to tell if a reservation is "inplace" reserved: */
> +/* This is how to tell if a multi-block reservation is "inplace" reserved: */
>  static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  {
>  	if (ip->i_res && ip->i_res->rs_requested)
> @@ -70,4 +81,22 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  	return 0;
>  }
>  
> +/* This is how to tell if a multi-block reservation is in the rgrp tree: */
> +static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
> +{
> +	if (rs && rs->rs_bi)
> +		return 1;
> +	return 0;
> +}
> +
> +static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
> +{
> +	return (bi->bi_start * GFS2_NBBY) + blk;
> +}
> +
> +static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
> +{
> +	return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
> +}
> +
>  #endif /* __RGRP_DOT_H__ */
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 7880687..b1502c4 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
>  		return -EIO;
>  	}
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;
> @@ -1550,6 +1554,9 @@ out_truncate:
>  
>  out_unlock:
>  	/* Error path for case 1 */
> +	if (gfs2_rs_active(ip->i_res))
> +		gfs2_rs_deltree(ip->i_res);
> +
>  	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
>  		gfs2_glock_dq(&ip->i_iopen_gh);
>  	gfs2_holder_uninit(&ip->i_iopen_gh);
> diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
> index 1b8b815..a25c252 100644
> --- a/fs/gfs2/trace_gfs2.h
> +++ b/fs/gfs2/trace_gfs2.h
> @@ -14,6 +14,7 @@
>  #include <linux/ktime.h>
>  #include "incore.h"
>  #include "glock.h"
> +#include "rgrp.h"
>  
>  #define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
>  #define glock_trace_name(x) __print_symbolic(x,		\
> @@ -31,6 +32,17 @@
>  			    { GFS2_BLKST_DINODE, "dinode" },	\
>  			    { GFS2_BLKST_UNLINKED, "unlinked" })
>  
> +#define TRACE_RS_DELETE  0
> +#define TRACE_RS_TREEDEL 1
> +#define TRACE_RS_INSERT  2
> +#define TRACE_RS_CLAIM   3
> +
> +#define rs_func_name(x) __print_symbolic(x,	\
> +					 { 0, "del " },	\
> +					 { 1, "tdel" },	\
> +					 { 2, "ins " },	\
> +					 { 3, "clm " })
> +
>  #define show_glock_flags(flags) __print_flags(flags, "",	\
>  	{(1UL << GLF_LOCK),			"l" },		\
>  	{(1UL << GLF_DEMOTE),			"D" },		\
> @@ -470,6 +482,7 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__field(	u8,	block_state		)
>  		__field(        u64,	rd_addr			)
>  		__field(        u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
>  	),
>  
>  	TP_fast_assign(
> @@ -480,16 +493,58 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__entry->block_state	= block_state;
>  		__entry->rd_addr	= rgd->rd_addr;
>  		__entry->rd_free_clone	= rgd->rd_free_clone;
> +		__entry->rd_reserved	= rgd->rd_reserved;
>  	),
>  
> -	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
> +	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long long)__entry->inum,
>  		  (unsigned long long)__entry->start,
>  		  (unsigned long)__entry->len,
>  		  block_state_name(__entry->block_state),
>  		  (unsigned long long)__entry->rd_addr,
> -		  __entry->rd_free_clone)
> +		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
> +);
> +
> +/* Keep track of multi-block reservations as they are allocated/freed */
> +TRACE_EVENT(gfs2_rs,
> +
> +	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
> +		 u8 func),
> +
> +	TP_ARGS(ip, rs, func),
> +
> +	TP_STRUCT__entry(
> +		__field(        dev_t,  dev                     )
> +		__field(	u64,	rd_addr			)
> +		__field(	u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
> +		__field(	u64,	inum			)
> +		__field(	u64,	start			)
> +		__field(	u32,	free			)
> +		__field(	u8,	func			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
> +		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
> +		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
> +		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
> +		__entry->inum		= ip ? ip->i_no_addr : 0;
> +		__entry->start		= gfs2_rs_startblk(rs);
> +		__entry->free		= rs->rs_free;
> +		__entry->func		= func;
> +	),
> +
> +	TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s "
> +		  "f:%lu",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long long)__entry->inum,
> +		  (unsigned long long)__entry->start,
> +		  (unsigned long long)__entry->rd_addr,
> +		  (unsigned long)__entry->rd_free_clone,
> +		  (unsigned long)__entry->rd_reserved,
> +		  rs_func_name(__entry->func), (unsigned long)__entry->free)
>  );
>  
>  #endif /* _TRACE_GFS2_H */
> diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
> index 523c0de..27a0b4a 100644
> --- a/fs/gfs2/xattr.c
> +++ b/fs/gfs2/xattr.c
> @@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto out_alloc;
> @@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock_check(ip);
>  	if (error)
>  		return error;
> @@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]