[Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal

Steven Whitehouse swhiteho at redhat.com
Sat Oct 8 11:13:00 UTC 2011


Hi,

On Fri, 2011-10-07 at 12:01 -0400, Bob Peterson wrote:
> Hi,
> 
> Thanks for the comments, Steve.  Here is another version
> that shows the same performance benefit (better, actually).
> It's much simpler than the previous one.  Instead of keeping
> a bitmap, it simply uses a u32 in the gfs2_inode to keep
> track of where it's last read-ahead.  That avoids a lot of
> the issues you wrote about.
> 
> I couldn't use the file struct because of the way function
> gfs2_dir_read is called from NFS.
> 
Thanks for fixing that up... it looks much simpler now. It's a pity about
the NFS issue. We still need to figure out how to reset the readahead
index correctly though, but I think we can leave that for a
future patch. One possible solution would be to reset it on
lseek(SEEK_SET, 0) for example, or if the readahead index is miles away
from the actual index.

Steve.

> Regards,
> 
> Bob Peterson
> Red Hat File Systems
> 
> Signed-off-by: Bob Peterson <rpeterso at redhat.com> 
> --
>  fs/gfs2/dir.c    |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/gfs2/incore.h |    1 +
>  2 files changed, 51 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index 2045d70..31888bd 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -76,6 +76,8 @@
>  #define IS_LEAF     1 /* Hashed (leaf) directory */
>  #define IS_DINODE   2 /* Linear (stuffed dinode block) directory */
>  
> +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
> +
>  #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
>  #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
>  
> @@ -345,6 +347,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
>  	if (hc)
>  		return hc;
>  
> +	ip->i_ra_index = 0;
>  	hsize = 1 << ip->i_depth;
>  	hsize *= sizeof(__be64);
>  	if (hsize != i_size_read(&ip->i_inode)) {
> @@ -382,6 +385,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
>  void gfs2_dir_hash_inval(struct gfs2_inode *ip)
>  {
>  	__be64 *hc = ip->i_hash_cache;
> +	ip->i_ra_index = 0;
>  	ip->i_hash_cache = NULL;
>  	kfree(hc);
>  }
> @@ -1377,6 +1381,50 @@ out:
>  }
>  
> 
> +/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
> + *
> + * Note: we can't calculate each index like dir_e_read can because we don't
> + * have the leaf, and therefore we don't have the depth, and therefore we
> + * don't have the length. So we have to just read enough ahead to make up
> + * for the loss of information. */
> +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
> +{
> +	struct gfs2_inode *ip = GFS2_I(inode);
> +	struct gfs2_glock *gl = ip->i_gl;
> +	struct buffer_head *bh;
> +	u64 blocknr = 0, last;
> +	unsigned count;
> +
> +	/* First check if we've already read-ahead for the whole range. */
> +	if (index + MAX_RA_BLOCKS < ip->i_ra_index)
> +		return;
> +
> +	ip->i_ra_index = max(index, ip->i_ra_index);
> +	for (count = 0; count < MAX_RA_BLOCKS; count++) {
> +		if (ip->i_ra_index >= hsize) /* if exceeded the hash table */
> +			break;
> +
> +		last = blocknr;
> +		blocknr = be64_to_cpu(ip->i_hash_cache[ip->i_ra_index]);
> +		ip->i_ra_index++;
> +		if (blocknr == last)
> +			continue;
> +
> +		bh = gfs2_getbuf(gl, blocknr, 1);
> +		if (trylock_buffer(bh)) {
> +			if (buffer_uptodate(bh)) {
> +				unlock_buffer(bh);
> +				brelse(bh);
> +				continue;
> +			}
> +			bh->b_end_io = end_buffer_read_sync;
> +			submit_bh(READA | REQ_META, bh);
> +			continue;
> +		}
> +		brelse(bh);
> +	}
> +}
> +
>  /**
>   * dir_e_read - Reads the entries from a directory into a filldir buffer
>   * @dip: dinode pointer
> @@ -1406,6 +1454,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
>  	if (IS_ERR(lp))
>  		return PTR_ERR(lp);
>  
> +	gfs2_dir_readahead(inode, hsize, index);
> +
>  	while (index < hsize) {
>  		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
>  					   &copied, &depth,
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 892ac37..50c3bcb 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -286,6 +286,7 @@ struct gfs2_inode {
>  	struct rw_semaphore i_rw_mutex;
>  	struct list_head i_trunc_list;
>  	__be64 *i_hash_cache;
> +	u32 i_ra_index; /* read-ahead index */
>  	u32 i_entries;
>  	u32 i_diskflags;
>  	u8 i_height;





More information about the Cluster-devel mailing list