[Cluster-devel] [GFS2 PATCH] GFS2: Break ordered_write list by rgrp for faster sorting

Steven Whitehouse swhiteho at redhat.com
Fri Aug 10 11:19:42 UTC 2012


Hi,

While it makes sense to accumulate the ordered write buffers on per-rgrp
lists so that they are partially pre-sorted, I'm less convinced that
there is a need to retain a per-rgrp "writing" list, since we must
traverse all of those items anyway. By keeping that as a global list we
then do not have the added complication of traversing the resource
groups. This might be a considerable win on a large filesystem with only
a very few dirty data blocks.

Also, it would make much more sense to use a per-rgrp lock on the
per-rgrp ordered list in order to get the most benefit from this change.

Steve.

On Thu, 2012-08-09 at 15:18 -0400, Bob Peterson wrote:
> Hi,
> 
> This patch moves the ordered_write buffer list from the superblock
> to the rgrps. That makes for several lists that are smaller, and
> therefore faster to find and sort.
> 
> Regards,
> 
> Bob Peterson
> Red Hat File Systems
> 
> Signed-off-by: Bob Peterson <rpeterso at redhat.com> 
> ---
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index 01c4975..3708e39 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -1083,8 +1083,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
>  			bh->b_private = NULL;
>  		}
>  		gfs2_log_unlock(sdp);
> -		if (bd)
> +		if (bd) {
> +			BUG_ON(!list_empty(&bd->bd_list));
>  			kmem_cache_free(gfs2_bufdata_cachep, bd);
> +		}
>  
>  		bh = bh->b_this_page;
>  	} while (bh != head);
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 99d7c64..d380152 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -97,6 +97,8 @@ struct gfs2_rgrpd {
>  #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
>  #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
>  #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
> +	struct list_head rd_log_le_ordered;
> +	struct list_head rd_log_le_writing;
>  	spinlock_t rd_rsspin;           /* protects reservation related vars */
>  	struct rb_root rd_rstree;       /* multi-block reservation tree */
>  	u32 rd_rs_cnt;                  /* count of current reservations */
> @@ -723,7 +725,6 @@ struct gfs2_sbd {
>  	struct list_head sd_log_le_buf;
>  	struct list_head sd_log_le_revoke;
>  	struct list_head sd_log_le_databuf;
> -	struct list_head sd_log_le_ordered;
>  
>  	atomic_t sd_log_thresh1;
>  	atomic_t sd_log_thresh2;
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> index f4beeb9..ce3e86d 100644
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -28,6 +28,7 @@
>  #include "log.h"
>  #include "lops.h"
>  #include "meta_io.h"
> +#include "rgrp.h"
>  #include "util.h"
>  #include "dir.h"
>  #include "trace_gfs2.h"
> @@ -500,29 +501,38 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
>  {
>  	struct gfs2_bufdata *bd;
>  	struct buffer_head *bh;
> -	LIST_HEAD(written);
> +	struct gfs2_rgrpd *rgd, *rg1;
>  
>  	gfs2_log_lock(sdp);
> -	list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);
> -	while (!list_empty(&sdp->sd_log_le_ordered)) {
> -		bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list);
> -		list_move(&bd->bd_list, &written);
> -		bh = bd->bd_bh;
> -		if (!buffer_dirty(bh))
> -			continue;
> -		get_bh(bh);
> -		gfs2_log_unlock(sdp);
> -		lock_buffer(bh);
> -		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
> -			bh->b_end_io = end_buffer_write_sync;
> -			submit_bh(WRITE_SYNC, bh);
> -		} else {
> -			unlock_buffer(bh);
> -			brelse(bh);
> +	rgd = rg1 = gfs2_rgrpd_get_first(sdp);
> +	while (rgd) {
> +		if (!list_empty(&rgd->rd_log_le_ordered) &&
> +		    !list_is_last(rgd->rd_log_le_ordered.next,
> +				  &rgd->rd_log_le_ordered))
> +			list_sort(NULL, &rgd->rd_log_le_ordered, &bd_cmp);
> +		while (!list_empty(&rgd->rd_log_le_ordered)) {
> +			bd = list_entry(rgd->rd_log_le_ordered.next,
> +					struct gfs2_bufdata, bd_list);
> +			list_move(&bd->bd_list, &rgd->rd_log_le_writing);
> +			bh = bd->bd_bh;
> +			if (!buffer_dirty(bh))
> +				continue;
> +			get_bh(bh);
> +			gfs2_log_unlock(sdp);
> +			lock_buffer(bh);
> +			if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
> +				bh->b_end_io = end_buffer_write_sync;
> +				submit_bh(WRITE_SYNC, bh);
> +			} else {
> +				unlock_buffer(bh);
> +				brelse(bh);
> +			}
> +			gfs2_log_lock(sdp);
>  		}
> -		gfs2_log_lock(sdp);
> +		rgd = gfs2_rgrpd_get_next(rgd);
> +		if (rgd == rg1)
> +			break;
>  	}
> -	list_splice(&written, &sdp->sd_log_le_ordered);
>  	gfs2_log_unlock(sdp);
>  }
>  
> @@ -530,20 +540,28 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
>  {
>  	struct gfs2_bufdata *bd;
>  	struct buffer_head *bh;
> +	struct gfs2_rgrpd *rgd, *rg1;
>  
>  	gfs2_log_lock(sdp);
> -	while (!list_empty(&sdp->sd_log_le_ordered)) {
> -		bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list);
> -		bh = bd->bd_bh;
> -		if (buffer_locked(bh)) {
> -			get_bh(bh);
> -			gfs2_log_unlock(sdp);
> -			wait_on_buffer(bh);
> -			brelse(bh);
> -			gfs2_log_lock(sdp);
> -			continue;
> +	rgd = rg1 = gfs2_rgrpd_get_first(sdp);
> +	while (rgd) {
> +		while (!list_empty(&rgd->rd_log_le_writing)) {
> +			bd = list_entry(rgd->rd_log_le_writing.prev,
> +					struct gfs2_bufdata, bd_list);
> +			list_del_init(&bd->bd_list);
> +			bh = bd->bd_bh;
> +			if (bh && buffer_locked(bh)) {
> +				get_bh(bh);
> +				gfs2_log_unlock(sdp);
> +				wait_on_buffer(bh);
> +				brelse(bh);
> +				gfs2_log_lock(sdp);
> +				continue;
> +			}
>  		}
> -		list_del_init(&bd->bd_list);
> +		rgd = gfs2_rgrpd_get_next(rgd);
> +		if (rgd == rg1)
> +			break;
>  	}
>  	gfs2_log_unlock(sdp);
>  }
> diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
> index 8ff95a2..39c483e 100644
> --- a/fs/gfs2/lops.c
> +++ b/fs/gfs2/lops.c
> @@ -672,6 +672,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
>  		gl = bd->bd_gl;
>  		atomic_dec(&gl->gl_revokes);
>  		clear_bit(GLF_LFLUSH, &gl->gl_flags);
> +		BUG_ON(!list_empty(&bd->bd_list));
>  		kmem_cache_free(gfs2_bufdata_cachep, bd);
>  	}
>  }
> @@ -776,6 +777,7 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
>  	struct gfs2_trans *tr = current->journal_info;
>  	struct address_space *mapping = bd->bd_bh->b_page->mapping;
>  	struct gfs2_inode *ip = GFS2_I(mapping->host);
> +	struct gfs2_rgrpd *rgd;
>  
>  	lock_buffer(bd->bd_bh);
>  	gfs2_log_lock(sdp);
> @@ -791,7 +793,13 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
>  		sdp->sd_log_num_databuf++;
>  		list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
>  	} else {
> -		list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
> +		if (ip->i_rgd &&
> +		    rgrp_contains_block(ip->i_rgd, bd->bd_bh->b_blocknr))
> +			rgd = ip->i_rgd;
> +		else
> +			rgd = gfs2_blk2rgrpd(sdp, bd->bd_bh->b_blocknr, 1);
> +		BUG_ON(rgd == NULL);
> +		list_add_tail(&bd->bd_list, &rgd->rd_log_le_ordered);
>  	}
>  out:
>  	gfs2_log_unlock(sdp);
> diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
> index e5af9dc..6b72d07 100644
> --- a/fs/gfs2/ops_fstype.c
> +++ b/fs/gfs2/ops_fstype.c
> @@ -100,7 +100,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
>  	INIT_LIST_HEAD(&sdp->sd_log_le_buf);
>  	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
>  	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
> -	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
>  
>  	init_waitqueue_head(&sdp->sd_log_waitq);
>  	init_waitqueue_head(&sdp->sd_logd_waitq);
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 47d2346..f9b2baf 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -331,13 +331,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
>  	}
>  }
>  
> -static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
> -{
> -	u64 first = rgd->rd_data0;
> -	u64 last = first + rgd->rd_data;
> -	return first <= block && block < last;
> -}
> -
>  /**
>   * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
>   * @sdp: The GFS2 superblock
> @@ -754,6 +747,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>  	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
>  	rgd->rd_data = be32_to_cpu(buf.ri_data);
>  	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
> +	INIT_LIST_HEAD(&rgd->rd_log_le_ordered);
> +	INIT_LIST_HEAD(&rgd->rd_log_le_writing);
>  	spin_lock_init(&rgd->rd_rsspin);
>  
>  	error = compute_bitstructs(rgd);
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index c98f6af..a379320 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -73,6 +73,13 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
>  				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
>  extern int gfs2_fitrim(struct file *filp, void __user *argp);
>  
> +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
> +{
> +	u64 first = rgd->rd_data0;
> +	u64 last = first + rgd->rd_data;
> +	return first <= block && block < last;
> +}
> +
>  /* This is how to tell if a reservation is in the rgrp tree: */
>  static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
>  {
> diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
> index adbd278..724f724 100644
> --- a/fs/gfs2/trans.c
> +++ b/fs/gfs2/trans.c
> @@ -186,6 +186,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
>  			list_del_init(&bd->bd_list);
>  			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
>  			sdp->sd_log_num_revoke--;
> +			BUG_ON(!list_empty(&bd->bd_list));
>  			kmem_cache_free(gfs2_bufdata_cachep, bd);
>  			tr->tr_num_revoke_rm++;
>  			if (--n == 0)
> 





More information about the Cluster-devel mailing list