[Cluster-devel] [GFS2 PATCH] GFS2: Break ordered_write list by rgrp for faster sorting
Steven Whitehouse
swhiteho at redhat.com
Fri Aug 10 11:19:42 UTC 2012
Hi,
While it makes sense to accumulate the ordered write buffers on per-rgrp
lists so that they are partially pre-sorted, I'm less convinced that
there is a need to retain a per-rgrp "writing" list, since we must
traverse all of those items anyway. By keeping that as a global list we
then do not have the added complication of traversing the resource
groups. This might be a considerable win on a large filesystem with only
a very few dirty data blocks.
Also, it would make much more sense to use a per-rgrp lock on the
per-rgrp ordered list in order to get the most benefit from this change.
Steve.
On Thu, 2012-08-09 at 15:18 -0400, Bob Peterson wrote:
> Hi,
>
> This patch moves the ordered_write buffer list from the superblock
> to the rgrps. That makes for several lists that are smaller, and
> therefore faster to find and sort.
>
> Regards,
>
> Bob Peterson
> Red Hat File Systems
>
> Signed-off-by: Bob Peterson <rpeterso at redhat.com>
> ---
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index 01c4975..3708e39 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -1083,8 +1083,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
> bh->b_private = NULL;
> }
> gfs2_log_unlock(sdp);
> - if (bd)
> + if (bd) {
> + BUG_ON(!list_empty(&bd->bd_list));
> kmem_cache_free(gfs2_bufdata_cachep, bd);
> + }
>
> bh = bh->b_this_page;
> } while (bh != head);
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 99d7c64..d380152 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -97,6 +97,8 @@ struct gfs2_rgrpd {
> #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
> #define GFS2_RDF_ERROR 0x40000000 /* error in rg */
> #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
> + struct list_head rd_log_le_ordered;
> + struct list_head rd_log_le_writing;
> spinlock_t rd_rsspin; /* protects reservation related vars */
> struct rb_root rd_rstree; /* multi-block reservation tree */
> u32 rd_rs_cnt; /* count of current reservations */
> @@ -723,7 +725,6 @@ struct gfs2_sbd {
> struct list_head sd_log_le_buf;
> struct list_head sd_log_le_revoke;
> struct list_head sd_log_le_databuf;
> - struct list_head sd_log_le_ordered;
>
> atomic_t sd_log_thresh1;
> atomic_t sd_log_thresh2;
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> index f4beeb9..ce3e86d 100644
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -28,6 +28,7 @@
> #include "log.h"
> #include "lops.h"
> #include "meta_io.h"
> +#include "rgrp.h"
> #include "util.h"
> #include "dir.h"
> #include "trace_gfs2.h"
> @@ -500,29 +501,38 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
> {
> struct gfs2_bufdata *bd;
> struct buffer_head *bh;
> - LIST_HEAD(written);
> + struct gfs2_rgrpd *rgd, *rg1;
>
> gfs2_log_lock(sdp);
> - list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);
> - while (!list_empty(&sdp->sd_log_le_ordered)) {
> - bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list);
> - list_move(&bd->bd_list, &written);
> - bh = bd->bd_bh;
> - if (!buffer_dirty(bh))
> - continue;
> - get_bh(bh);
> - gfs2_log_unlock(sdp);
> - lock_buffer(bh);
> - if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
> - bh->b_end_io = end_buffer_write_sync;
> - submit_bh(WRITE_SYNC, bh);
> - } else {
> - unlock_buffer(bh);
> - brelse(bh);
> + rgd = rg1 = gfs2_rgrpd_get_first(sdp);
> + while (rgd) {
> + if (!list_empty(&rgd->rd_log_le_ordered) &&
> + !list_is_last(rgd->rd_log_le_ordered.next,
> + &rgd->rd_log_le_ordered))
> + list_sort(NULL, &rgd->rd_log_le_ordered, &bd_cmp);
> + while (!list_empty(&rgd->rd_log_le_ordered)) {
> + bd = list_entry(rgd->rd_log_le_ordered.next,
> + struct gfs2_bufdata, bd_list);
> + list_move(&bd->bd_list, &rgd->rd_log_le_writing);
> + bh = bd->bd_bh;
> + if (!buffer_dirty(bh))
> + continue;
> + get_bh(bh);
> + gfs2_log_unlock(sdp);
> + lock_buffer(bh);
> + if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
> + bh->b_end_io = end_buffer_write_sync;
> + submit_bh(WRITE_SYNC, bh);
> + } else {
> + unlock_buffer(bh);
> + brelse(bh);
> + }
> + gfs2_log_lock(sdp);
> }
> - gfs2_log_lock(sdp);
> + rgd = gfs2_rgrpd_get_next(rgd);
> + if (rgd == rg1)
> + break;
> }
> - list_splice(&written, &sdp->sd_log_le_ordered);
> gfs2_log_unlock(sdp);
> }
>
> @@ -530,20 +540,28 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
> {
> struct gfs2_bufdata *bd;
> struct buffer_head *bh;
> + struct gfs2_rgrpd *rgd, *rg1;
>
> gfs2_log_lock(sdp);
> - while (!list_empty(&sdp->sd_log_le_ordered)) {
> - bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list);
> - bh = bd->bd_bh;
> - if (buffer_locked(bh)) {
> - get_bh(bh);
> - gfs2_log_unlock(sdp);
> - wait_on_buffer(bh);
> - brelse(bh);
> - gfs2_log_lock(sdp);
> - continue;
> + rgd = rg1 = gfs2_rgrpd_get_first(sdp);
> + while (rgd) {
> + while (!list_empty(&rgd->rd_log_le_writing)) {
> + bd = list_entry(rgd->rd_log_le_writing.prev,
> + struct gfs2_bufdata, bd_list);
> + list_del_init(&bd->bd_list);
> + bh = bd->bd_bh;
> + if (bh && buffer_locked(bh)) {
> + get_bh(bh);
> + gfs2_log_unlock(sdp);
> + wait_on_buffer(bh);
> + brelse(bh);
> + gfs2_log_lock(sdp);
> + continue;
> + }
> }
> - list_del_init(&bd->bd_list);
> + rgd = gfs2_rgrpd_get_next(rgd);
> + if (rgd == rg1)
> + break;
> }
> gfs2_log_unlock(sdp);
> }
> diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
> index 8ff95a2..39c483e 100644
> --- a/fs/gfs2/lops.c
> +++ b/fs/gfs2/lops.c
> @@ -672,6 +672,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
> gl = bd->bd_gl;
> atomic_dec(&gl->gl_revokes);
> clear_bit(GLF_LFLUSH, &gl->gl_flags);
> + BUG_ON(!list_empty(&bd->bd_list));
> kmem_cache_free(gfs2_bufdata_cachep, bd);
> }
> }
> @@ -776,6 +777,7 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
> struct gfs2_trans *tr = current->journal_info;
> struct address_space *mapping = bd->bd_bh->b_page->mapping;
> struct gfs2_inode *ip = GFS2_I(mapping->host);
> + struct gfs2_rgrpd *rgd;
>
> lock_buffer(bd->bd_bh);
> gfs2_log_lock(sdp);
> @@ -791,7 +793,13 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
> sdp->sd_log_num_databuf++;
> list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
> } else {
> - list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
> + if (ip->i_rgd &&
> + rgrp_contains_block(ip->i_rgd, bd->bd_bh->b_blocknr))
> + rgd = ip->i_rgd;
> + else
> + rgd = gfs2_blk2rgrpd(sdp, bd->bd_bh->b_blocknr, 1);
> + BUG_ON(rgd == NULL);
> + list_add_tail(&bd->bd_list, &rgd->rd_log_le_ordered);
> }
> out:
> gfs2_log_unlock(sdp);
> diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
> index e5af9dc..6b72d07 100644
> --- a/fs/gfs2/ops_fstype.c
> +++ b/fs/gfs2/ops_fstype.c
> @@ -100,7 +100,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
> INIT_LIST_HEAD(&sdp->sd_log_le_buf);
> INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
> INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
> - INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
>
> init_waitqueue_head(&sdp->sd_log_waitq);
> init_waitqueue_head(&sdp->sd_logd_waitq);
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 47d2346..f9b2baf 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -331,13 +331,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
> }
> }
>
> -static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
> -{
> - u64 first = rgd->rd_data0;
> - u64 last = first + rgd->rd_data;
> - return first <= block && block < last;
> -}
> -
> /**
> * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
> * @sdp: The GFS2 superblock
> @@ -754,6 +747,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
> rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
> rgd->rd_data = be32_to_cpu(buf.ri_data);
> rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
> + INIT_LIST_HEAD(&rgd->rd_log_le_ordered);
> + INIT_LIST_HEAD(&rgd->rd_log_le_writing);
> spin_lock_init(&rgd->rd_rsspin);
>
> error = compute_bitstructs(rgd);
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index c98f6af..a379320 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -73,6 +73,13 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
> const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
> extern int gfs2_fitrim(struct file *filp, void __user *argp);
>
> +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
> +{
> + u64 first = rgd->rd_data0;
> + u64 last = first + rgd->rd_data;
> + return first <= block && block < last;
> +}
> +
> /* This is how to tell if a reservation is in the rgrp tree: */
> static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
> {
> diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
> index adbd278..724f724 100644
> --- a/fs/gfs2/trans.c
> +++ b/fs/gfs2/trans.c
> @@ -186,6 +186,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
> list_del_init(&bd->bd_list);
> gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
> sdp->sd_log_num_revoke--;
> + BUG_ON(!list_empty(&bd->bd_list));
> kmem_cache_free(gfs2_bufdata_cachep, bd);
> tr->tr_num_revoke_rm++;
> if (--n == 0)
>
More information about the Cluster-devel
mailing list