[Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal

Bob Peterson rpeterso at redhat.com
Thu Oct 6 16:15:12 UTC 2011


----- Original Message -----
| Hi,
| 
| Were we not intending to make this turn itself off in cases where
| it produces no benefit? I thought the plan was to track the state
| via the file descriptor in order to avoid reading ahead the same
| blocks over and over again too,
| 
| Steve.

Hi Steve,

Based on our discussion, I revised the patch to implement a
scheme whereby we don't duplicate read-ahead requests.  However,
I decided against using the file descriptor in favor of a
simple bitmap that keeps track of where we've done read-ahead.

So now, instead of a bare hash-table pointer in the inode, I have
a structure that contains two pointers: (1) the hash table, as
before, and (2) the bitmap that tracks read-ahead requests.
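The idea, as a minimal sketch (the helper name and the userspace
types are mine, not from the patch): one bit per hash table slot,
tested and set the first time read-ahead is issued for that slot:

#include <stdint.h>
#include <stdbool.h>

/* Returns true if read-ahead was already requested for this slot;
 * otherwise marks the slot and returns false. */
static bool ra_test_and_set(uint8_t *ra, uint32_t index)
{
	uint8_t bit = 1 << (index % 8);

	if (ra[index / 8] & bit)
		return true;	/* already requested; skip this block */
	ra[index / 8] |= bit;
	return false;		/* first request; issue read-ahead */
}

This is the same byte/bit math gfs2_dir_readahead does on h_ra in
the patch below.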

I'm very happy with the results.  I've got a file system with
500,000 files in a single directory.  On my RHEL5 system, when
I perform the command: "time ls -fR /mnt/gfs2 > /dev/null" I
get these times:

Stock pre-release 5.8:   0m34.481s
Previously posted patch: 0m18.811s
With this patch:         0m9.843s

which is more than three times faster, and nearly twice as
fast as the version I previously posted.  I haven't tried my
"1000x1000" test on it yet.

The upstream version of the patch is given below.
There is one noticeable difference between RHEL5 and upstream:
I changed kmalloc to vmalloc; I'm not sure how you feel about that.
Also, I have a feeling you're going to ask me to remove the
#define MAX_RA_BLOCKS and hard-code it to 32.  What can I say?
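For a sense of scale (my arithmetic, not part of the patch): the
table doubles on every exhash split, so at larger i_depth values a
physically contiguous kmalloc becomes an unreliable proposition,
which is the motivation for vmalloc.  A throwaway userspace C
program with the same hsize/rasize math, using illustrative depths:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Same arithmetic as the patch: hsize = 1 << i_depth slots
	 * of __be64 (8 bytes each); rasize = hsize / 8 bytes, i.e.
	 * one bit per slot. */
	for (unsigned depth = 12; depth <= 20; depth += 4) {
		uint64_t hsize = 1ULL << depth;

		printf("depth %2u: table %8llu bytes, ra bitmap %6llu bytes\n",
		       depth,
		       (unsigned long long)(hsize * 8),
		       (unsigned long long)(hsize / 8));
	}
	return 0;
}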

Regards,

Bob Peterson
Red Hat File Systems
--
commit 19ec5ffb5eaf915944861f8ebd4c065818a238c5
Author: Bob Peterson <rpeterso at redhat.com>
Date:   Thu Oct 6 10:34:05 2011 -0500

    GFS2: Add readahead to sequential directory traversal
    
    This patch adds read-ahead capability to GFS2's
    directory hash table management.  It greatly improves
    performance for some directory operations.  For example:
    In one of my file systems that has 1000 directories, each
    of which has 1000 files, time to execute a recursive
    ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
    from 2m2.814s on a stock kernel to 0m45.938s.

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..1c89ae0 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -329,48 +329,58 @@ fail:
  * gfs2_dir_get_hash_table - Get pointer to the dir hash table
  * @ip: The inode in question
  *
- * Returns: The hash table or an error
+ * Returns: an error code
  */
 
-static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
+static int gfs2_dir_get_hash_table(struct gfs2_inode *ip)
 {
 	struct inode *inode = &ip->i_inode;
 	int ret;
-	u32 hsize;
+	u32 hsize, rasize;
 	__be64 *hc;
+	u8 *ra;
 
 	BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH));
 
-	hc = ip->i_hash_cache;
-	if (hc)
-		return hc;
+	if (ip->i_hash_cache.h_ht)
+		return 0;
 
 	hsize = 1 << ip->i_depth;
+	rasize = hsize / 8; /* bit array */
 	hsize *= sizeof(__be64);
 	if (hsize != i_size_read(&ip->i_inode)) {
 		gfs2_consist_inode(ip);
-		return ERR_PTR(-EIO);
+		return -EIO;
 	}
 
-	hc = kmalloc(hsize, GFP_NOFS);
-	ret = -ENOMEM;
+	hc = vmalloc(hsize);
 	if (hc == NULL)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
+
+	ra = vzalloc(rasize);
+	if (ra == NULL) {
+		vfree(hc);
+		return -ENOMEM;
+	}
 
 	ret = gfs2_dir_read_data(ip, hc, hsize);
 	if (ret < 0) {
-		kfree(hc);
-		return ERR_PTR(ret);
+		vfree(hc);
+		vfree(ra);
+		return ret;
 	}
 
 	spin_lock(&inode->i_lock);
-	if (ip->i_hash_cache)
-		kfree(hc);
-	else
-		ip->i_hash_cache = hc;
+	if (ip->i_hash_cache.h_ht) {
+		vfree(hc);
+		vfree(ra);
+	} else {
+		ip->i_hash_cache.h_ht = hc;
+		ip->i_hash_cache.h_ra = ra;
+	}
 	spin_unlock(&inode->i_lock);
 
-	return ip->i_hash_cache;
+	return 0;
 }
 
 /**
@@ -381,9 +391,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
  */
 void gfs2_dir_hash_inval(struct gfs2_inode *ip)
 {
-	__be64 *hc = ip->i_hash_cache;
-	ip->i_hash_cache = NULL;
-	kfree(hc);
+	__be64 *hc = ip->i_hash_cache.h_ht;
+	u8 *h_ra = ip->i_hash_cache.h_ra;
+	ip->i_hash_cache.h_ht = NULL;
+	ip->i_hash_cache.h_ra = NULL;
+	vfree(hc);
+	vfree(h_ra);
 }
 
 static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -730,14 +743,15 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
  * Returns: 0 on success, error code otherwise
  */
 
-static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
-		       u64 *leaf_out)
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out)
 {
+	int error;
 	__be64 *hash;
 
-	hash = gfs2_dir_get_hash_table(dip);
-	if (IS_ERR(hash))
-		return PTR_ERR(hash);
+	error = gfs2_dir_get_hash_table(dip);
+	if (error)
+		return error;
+	hash = dip->i_hash_cache.h_ht;
 	*leaf_out = be64_to_cpu(*(hash + index));
 	return 0;
 }
@@ -1097,27 +1111,35 @@ fail_brelse:
 static int dir_double_exhash(struct gfs2_inode *dip)
 {
 	struct buffer_head *dibh;
-	u32 hsize;
+	u32 hsize, rasize;
 	u32 hsize_bytes;
-	__be64 *hc;
-	__be64 *hc2, *h;
+	__be64 *hc, *hc2, *h;
+	u8 *ra;
 	int x;
-	int error = 0;
+	int error;
 
 	hsize = 1 << dip->i_depth;
+	rasize = hsize / 8; /* bit array */
 	hsize_bytes = hsize * sizeof(__be64);
 
-	hc = gfs2_dir_get_hash_table(dip);
-	if (IS_ERR(hc))
-		return PTR_ERR(hc);
+	error = gfs2_dir_get_hash_table(dip);
+	if (error)
+		return error;
 
-	h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
-	if (!hc2)
+	hc = dip->i_hash_cache.h_ht;
+	h = hc2 = vmalloc(hsize_bytes * 2);
+	if (h == NULL)
 		return -ENOMEM;
 
+	ra = vzalloc(rasize);
+	if (ra == NULL) {
+		error = -ENOMEM;
+		goto out_vfree;
+	}
+
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (error)
-		goto out_kfree;
+		goto out_rafree;
 
 	for (x = 0; x < hsize; x++) {
 		*h++ = *hc;
@@ -1130,7 +1152,8 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 		goto fail;
 
 	gfs2_dir_hash_inval(dip);
-	dip->i_hash_cache = hc2;
+	dip->i_hash_cache.h_ht = hc2;
+	dip->i_hash_cache.h_ra = ra;
 	dip->i_depth++;
 	gfs2_dinode_out(dip, dibh->b_data);
 	brelse(dibh);
@@ -1142,8 +1165,11 @@ fail:
 	i_size_write(&dip->i_inode, hsize_bytes);
 	gfs2_dinode_out(dip, dibh->b_data);
 	brelse(dibh);
-out_kfree:
-	kfree(hc2);
+
+out_rafree:
+	vfree(ra);
+out_vfree:
+	vfree(hc2);
 	return error;
 }
 
@@ -1376,6 +1402,53 @@ out:
 	return error;
 }
 
+#define MAX_RA_BLOCKS 32
+
+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	struct buffer_head *bh;
+	u64 blocknr = 0, last;
+	unsigned count = 0;
+	int ra_byte, ra_bit;
+
+	while (index < hsize) {
+		ra_byte = index / 8;
+		ra_bit = index % 8;
+		last = blocknr;
+		blocknr = be64_to_cpu(ip->i_hash_cache.h_ht[index++]);
+		if (blocknr == last)
+			continue;
+		count++;
+		if (count > MAX_RA_BLOCKS)
+			break;
+
+		/* figure out if we've already requested this block */
+		if ((ip->i_hash_cache.h_ra[ra_byte] >> ra_bit) & 0x01)
+			continue;
+
+		ip->i_hash_cache.h_ra[ra_byte] |= (1 << ra_bit);
+		bh = gfs2_getbuf(gl, blocknr, 1);
+		if (trylock_buffer(bh)) {
+			if (buffer_uptodate(bh)) {
+				unlock_buffer(bh);
+				brelse(bh);
+				continue;
+			}
+			bh->b_end_io = end_buffer_read_sync;
+			submit_bh(READA | REQ_META, bh);
+			continue;
+		}
+		brelse(bh);
+	}
+}
 
 /**
  * dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1391,20 +1464,23 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 		      filldir_t filldir)
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
-	u32 hsize, len = 0;
+	u32 hsize, len;
 	u32 hash, index;
 	__be64 *lp;
 	int copied = 0;
-	int error = 0;
+	int error;
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
 	hash = gfs2_dir_offset2hash(*offset);
 	index = hash >> (32 - dip->i_depth);
 
-	lp = gfs2_dir_get_hash_table(dip);
-	if (IS_ERR(lp))
-		return PTR_ERR(lp);
+	error = gfs2_dir_get_hash_table(dip);
+	if (error)
+		return error;
+
+	lp = dip->i_hash_cache.h_ht;
+	gfs2_dir_readahead(inode, hsize, index);
 
 	while (index < hsize) {
 		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
@@ -1925,14 +2001,13 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
 	u32 index = 0, next_index;
 	__be64 *lp;
 	u64 leaf_no;
-	int error = 0, last;
+	int error, last;
 
 	hsize = 1 << dip->i_depth;
-
-	lp = gfs2_dir_get_hash_table(dip);
-	if (IS_ERR(lp))
-		return PTR_ERR(lp);
-
+	error = gfs2_dir_get_hash_table(dip);
+	if (error)
+		return error;
+	lp = dip->i_hash_cache.h_ht;
 	while (index < hsize) {
 		leaf_no = be64_to_cpu(lp[index]);
 		if (leaf_no) {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37..731d763 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -271,6 +271,11 @@ enum {
 };
 
 
+struct gfs2_hash_table {
+	__be64 *h_ht; /* hash table data */
+	u8 *h_ra; /* array indicating whether hash table block was read */
+};
+
 struct gfs2_inode {
 	struct inode i_inode;
 	u64 i_no_addr;
@@ -285,7 +290,7 @@ struct gfs2_inode {
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
 	struct list_head i_trunc_list;
-	__be64 *i_hash_cache;
+	struct gfs2_hash_table i_hash_cache;
 	u32 i_entries;
 	u32 i_diskflags;
 	u8 i_height;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 8a139ff..a834b88 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,8 @@ static void gfs2_init_inode_once(void *foo)
 	init_rwsem(&ip->i_rw_mutex);
 	INIT_LIST_HEAD(&ip->i_trunc_list);
 	ip->i_alloc = NULL;
-	ip->i_hash_cache = NULL;
+	ip->i_hash_cache.h_ht = NULL;
+	ip->i_hash_cache.h_ra = NULL;
 }
 
 static void gfs2_init_glock_once(void *foo)



