[Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal

Mon Oct 24 08:17:35 UTC 2011

Hi,

That looks much better, but why not create & pass a file_ra_state in the
NFS getname function so that you don't have to check for it being NULL?
Since that function always reads everything up until the name it is
looking for, it is an ideal place to do readahead as the access will
always be sequential.

Otherwise, it looks good to me,

Steve.

On Fri, 2011-10-21 at 12:53 -0400, Bob Peterson wrote:
> ----- Original Message -----
> | Well it is not called directly from NFS though, only from our getname
> | function. I'd suggest we do something along the following lines:
> | 
> | Create a new structure, something like this:
> | 
> | struct gfs2_dir_ra_state {
> | 	/* Readahead state variables... */
> | };
> | 
> | and then pass that to gfs2_dir_read() in addition to passing the
> | offset. The getname call is trivial, since it always starts at the
> | beginning and reads through sequentially, so it just needs a
> | gfs2_dir_ra_state to be set up to take account of that. The struct
> | gfs2_dir_ra_state can be embedded into the struct gfs2_file, so that
> | gfs2_readdir() just passes a pointer to that.
> | 
> | That will allow us to reset the readahead state upon
> | lseek(SEEK_SET, 0), and also flag whether the accesses become
> | non-sequential.
> | 
> | So I think keeping the state in the struct file and passing it to our
> | gfs2_dir_read() function should not be too tricky,
> | 
> | Steve.
> 
> Hi Steve,
> 
> Thanks for the feedback.  How about something like this (follows):
> Instead of creating a new read-ahead structure gfs2_dir_ra_state
> I'm making (non-standard) use of vfs's read-ahead struct in the
> file.
> 
> Regards,
> 
> Bob Peterson
> Red Hat File System
> --
> commit 14e64999c89c7e2801d69b9e877f2fa0bf10ad70
> Author: Bob Peterson <rpeterso at redhat.com>
> Date:   Fri Oct 7 11:55:31 2011 -0500
> 
>     GFS2: Add readahead to sequential directory traversal
>     
>     This patch adds read-ahead capability to GFS2's
>     directory hash table management.  It greatly improves
>     performance for some directory operations.  For example:
>     In one of my file systems that has 1000 directories, each
>     of which has 1000 files, time to execute a recursive
>     ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
>     from 2m2.814s on a stock kernel to 0m45.938s.
> 
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index 2045d70..30405d4 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -76,6 +76,8 @@
>  #define IS_LEAF     1 /* Hashed (leaf) directory */
>  #define IS_DINODE   2 /* Linear (stuffed dinode block) directory */
>  
> +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
> +
>  #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
>  #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
>  
> @@ -1376,6 +1378,50 @@ out:
>  	return error;
>  }
>  
> +/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
> + *
> + * Note: we can't calculate each index like dir_e_read can because we don't
> + * have the leaf, and therefore we don't have the depth, and therefore we
> + * don't have the length. So we have to just read enough ahead to make up
> + * for the loss of information. */
> +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
> +			       struct file_ra_state *f_ra)
> +{
> +	struct gfs2_inode *ip = GFS2_I(inode);
> +	struct gfs2_glock *gl = ip->i_gl;
> +	struct buffer_head *bh;
> +	u64 blocknr = 0, last;
> +	unsigned count;
> +
> +	/* First check if we've already read-ahead for the whole range. */
> +	if (!f_ra || index + MAX_RA_BLOCKS < f_ra->start)
> +		return;
> +
> +	f_ra->start = max((pgoff_t)index, f_ra->start);
> +	for (count = 0; count < MAX_RA_BLOCKS; count++) {
> +		if (f_ra->start >= hsize) /* if exceeded the hash table */
> +			break;
> +
> +		last = blocknr;
> +		blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]);
> +		f_ra->start++;
> +		if (blocknr == last)
> +			continue;
> +
> +		bh = gfs2_getbuf(gl, blocknr, 1);
> +		if (trylock_buffer(bh)) {
> +			if (buffer_uptodate(bh)) {
> +				unlock_buffer(bh);
> +				brelse(bh);
> +				continue;
> +			}
> +			bh->b_end_io = end_buffer_read_sync;
> +			submit_bh(READA | REQ_META, bh);
> +			continue;
> +		}
> +		brelse(bh);
> +	}
> +}
>  
>  /**
>   * dir_e_read - Reads the entries from a directory into a filldir buffer
> @@ -1388,7 +1434,7 @@ out:
>   */
>  
>  static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
> -		      filldir_t filldir)
> +		      filldir_t filldir, struct file_ra_state *f_ra)
>  {
>  	struct gfs2_inode *dip = GFS2_I(inode);
>  	u32 hsize, len = 0;
> @@ -1402,10 +1448,14 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
>  	hash = gfs2_dir_offset2hash(*offset);
>  	index = hash >> (32 - dip->i_depth);
>  
> +	if (f_ra && dip->i_hash_cache == NULL)
> +		f_ra->start = 0;
>  	lp = gfs2_dir_get_hash_table(dip);
>  	if (IS_ERR(lp))
>  		return PTR_ERR(lp);
>  
> +	gfs2_dir_readahead(inode, hsize, index, f_ra);
> +
>  	while (index < hsize) {
>  		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
>  					   &copied, &depth,
> @@ -1423,7 +1473,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
>  }
>  
>  int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
> -		  filldir_t filldir)
> +		  filldir_t filldir, struct file_ra_state *f_ra)
>  {
>  	struct gfs2_inode *dip = GFS2_I(inode);
>  	struct gfs2_sbd *sdp = GFS2_SB(inode);
> @@ -1437,7 +1487,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
>  		return 0;
>  
>  	if (dip->i_diskflags & GFS2_DIF_EXHASH)
> -		return dir_e_read(inode, offset, opaque, filldir);
> +		return dir_e_read(inode, offset, opaque, filldir, f_ra);
>  
>  	if (!gfs2_is_stuffed(dip)) {
>  		gfs2_consist_inode(dip);
> diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
> index ff5772f..98c960b 100644
> --- a/fs/gfs2/dir.h
> +++ b/fs/gfs2/dir.h
> @@ -25,7 +25,7 @@ extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
>  			const struct gfs2_inode *ip);
>  extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
>  extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
> -			 filldir_t filldir);
> +			 filldir_t filldir, struct file_ra_state *f_ra);
>  extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
>  			  const struct gfs2_inode *nip, unsigned int new_type);
>  
> diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
> index fe9945f..a581cd2 100644
> --- a/fs/gfs2/export.c
> +++ b/fs/gfs2/export.c
> @@ -118,7 +118,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
>  	if (error)
>  		return error;
>  
> -	error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
> +	error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, NULL);
>  
>  	gfs2_glock_dq_uninit(&gh);
>  
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 92c3db4..c9cbb5f 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -96,7 +96,7 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
>  		return error;
>  	}
>  
> -	error = gfs2_dir_read(dir, &offset, dirent, filldir);
> +	error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
>  
>  	gfs2_glock_dq_uninit(&d_gh);
>