[Cluster-devel] [GFS2 Patch] GFS2: Add readahead to sequential directory traversal
Bob Peterson
rpeterso at redhat.com
Thu Oct 6 16:15:12 UTC 2011
----- Original Message -----
| Hi,
|
| Were we not intending to make this turn itself off in cases where it
| produces no benefit? I thought the plan was to track the state via
| the
| file descriptor in order to avoid reading ahead the same blocks over
| and
| over again too,
|
| Steve.
Hi Steve,
Based on our discussion, I revised the patch to implement a
scheme whereby we don't duplicate read-ahead requests. However,
I decided against using the file descriptor in favor of a
simple bitmap that keeps track of where we've done read-ahead.
So now, instead of a bare hash table pointer in the inode, I have a
structure that contains two pointers: (1) the hash table, as before,
and (2) a bitmap that keeps track of which read-ahead requests have
already been issued.
I'm very happy with the results. I've got a file system with
500,000 files in a single directory. On my RHEL5 system, when
I perform the command: "time ls -fR /mnt/gfs2 > /dev/null" I
get these times:
Stock pre-release 5.8: 0m34.481s
Previously posted patch: 0m18.811s
With this patch: 0m9.843s
which is more than three times faster, and nearly twice as
fast as the version I previously posted. I haven't tried my
"1000x1000" test on it yet.
The upstream version of the patch is given below.
There is one noticeable difference between RHEL5 and upstream:
I changed kmalloc to vmalloc. Not sure your feelings on that.
Also, I have a feeling you're going to ask me to remove the
#define MAX_RA_BLOCKS and hard-code it to 32. What can I say?
Regards,
Bob Peterson
Red Hat File Systems
--
commit 19ec5ffb5eaf915944861f8ebd4c065818a238c5
Author: Bob Peterson <rpeterso at redhat.com>
Date: Thu Oct 6 10:34:05 2011 -0500
GFS2: Add readahead to sequential directory traversal
This patch adds read-ahead capability to GFS2's
directory hash table management. It greatly improves
performance for some directory operations. For example:
In one of my file systems that has 1000 directories, each
of which has 1000 files, time to execute a recursive
ls (time ls -fR /mnt/gfs2 > /dev/null) was reduced
from 2m2.814s on a stock kernel to 0m45.938s.
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2045d70..1c89ae0 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -329,48 +329,58 @@ fail:
* gfs2_dir_get_hash_table - Get pointer to the dir hash table
* @ip: The inode in question
*
- * Returns: The hash table or an error
+ * Returns: an error code
*/
-static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
+static int gfs2_dir_get_hash_table(struct gfs2_inode *ip)
{
struct inode *inode = &ip->i_inode;
int ret;
- u32 hsize;
+ u32 hsize, rasize;
__be64 *hc;
+ u8 *ra;
BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH));
- hc = ip->i_hash_cache;
- if (hc)
- return hc;
+ if (ip->i_hash_cache.h_ht)
+ return 0;
hsize = 1 << ip->i_depth;
+ rasize = hsize / 8; /* bit array */
hsize *= sizeof(__be64);
if (hsize != i_size_read(&ip->i_inode)) {
gfs2_consist_inode(ip);
- return ERR_PTR(-EIO);
+ return -EIO;
}
- hc = kmalloc(hsize, GFP_NOFS);
- ret = -ENOMEM;
+ hc = vmalloc(hsize);
if (hc == NULL)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
+
+ ra = vzalloc(rasize);
+ if (ra == NULL) {
+ vfree(hc);
+ return -ENOMEM;
+ }
ret = gfs2_dir_read_data(ip, hc, hsize);
if (ret < 0) {
- kfree(hc);
- return ERR_PTR(ret);
+ vfree(hc);
+ vfree(ra);
+ return ret;
}
spin_lock(&inode->i_lock);
- if (ip->i_hash_cache)
- kfree(hc);
- else
- ip->i_hash_cache = hc;
+ if (ip->i_hash_cache.h_ht) {
+ vfree(hc);
+ vfree(ra);
+ } else {
+ ip->i_hash_cache.h_ht = hc;
+ ip->i_hash_cache.h_ra = ra;
+ }
spin_unlock(&inode->i_lock);
- return ip->i_hash_cache;
+ return 0;
}
/**
@@ -381,9 +391,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
*/
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
- __be64 *hc = ip->i_hash_cache;
- ip->i_hash_cache = NULL;
- kfree(hc);
+ __be64 *hc = ip->i_hash_cache.h_ht;
+ u8 *h_ra = ip->i_hash_cache.h_ra;
+ ip->i_hash_cache.h_ht = NULL;
+ ip->i_hash_cache.h_ra = NULL;
+ vfree(hc);
+ vfree(h_ra);
}
static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -730,14 +743,15 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
* Returns: 0 on success, error code otherwise
*/
-static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
- u64 *leaf_out)
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out)
{
+ int error;
__be64 *hash;
- hash = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hash))
- return PTR_ERR(hash);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+ hash = dip->i_hash_cache.h_ht;
*leaf_out = be64_to_cpu(*(hash + index));
return 0;
}
@@ -1097,27 +1111,35 @@ fail_brelse:
static int dir_double_exhash(struct gfs2_inode *dip)
{
struct buffer_head *dibh;
- u32 hsize;
+ u32 hsize, rasize;
u32 hsize_bytes;
- __be64 *hc;
- __be64 *hc2, *h;
+ __be64 *hc, *hc2, *h;
+ u8 *ra;
int x;
- int error = 0;
+ int error;
hsize = 1 << dip->i_depth;
+ rasize = hsize / 8; /* bit array */
hsize_bytes = hsize * sizeof(__be64);
- hc = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(hc))
- return PTR_ERR(hc);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
- h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
- if (!hc2)
+ hc = dip->i_hash_cache.h_ht;
+ h = hc2 = vmalloc(hsize_bytes * 2);
+ if (h == NULL)
return -ENOMEM;
+ ra = vzalloc(rasize);
+ if (ra == NULL) {
+ error = -ENOMEM;
+ goto out_vfree;
+ }
+
error = gfs2_meta_inode_buffer(dip, &dibh);
if (error)
- goto out_kfree;
+ goto out_rafree;
for (x = 0; x < hsize; x++) {
*h++ = *hc;
@@ -1130,7 +1152,8 @@ static int dir_double_exhash(struct gfs2_inode *dip)
goto fail;
gfs2_dir_hash_inval(dip);
- dip->i_hash_cache = hc2;
+ dip->i_hash_cache.h_ht = hc2;
+ dip->i_hash_cache.h_ra = ra;
dip->i_depth++;
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
@@ -1142,8 +1165,11 @@ fail:
i_size_write(&dip->i_inode, hsize_bytes);
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
-out_kfree:
- kfree(hc2);
+
+out_rafree:
+ vfree(ra);
+out_vfree:
+ vfree(hc2);
return error;
}
@@ -1376,6 +1402,53 @@ out:
return error;
}
+#define MAX_RA_BLOCKS 32
+
+/* gfs2_dir_readahead - Issue read-ahead requests for leaf blocks.
+ *
+ * Note: we can't calculate each index like dir_e_read can because we don't
+ * have the leaf, and therefore we don't have the depth, and therefore we
+ * don't have the length. So we have to just read enough ahead to make up
+ * for the loss of information. */
+static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh;
+ u64 blocknr = 0, last;
+ unsigned count = 0;
+ int ra_byte, ra_bit;
+
+ while (index < hsize) {
+ ra_byte = index / 8;
+ ra_bit = index % 8;
+ last = blocknr;
+ blocknr = be64_to_cpu(ip->i_hash_cache.h_ht[index++]);
+ if (blocknr == last)
+ continue;
+ count++;
+ if (count > MAX_RA_BLOCKS)
+ break;
+
+ /* figure out if we've already requested this block */
+ if ((ip->i_hash_cache.h_ra[ra_byte] >> ra_bit) & 0x01)
+ continue;
+
+ ip->i_hash_cache.h_ra[ra_byte] |= (1 << ra_bit);
+ bh = gfs2_getbuf(gl, blocknr, 1);
+ if (trylock_buffer(bh)) {
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ continue;
+ }
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READA | REQ_META, bh);
+ continue;
+ }
+ brelse(bh);
+ }
+}
/**
* dir_e_read - Reads the entries from a directory into a filldir buffer
@@ -1391,20 +1464,23 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
filldir_t filldir)
{
struct gfs2_inode *dip = GFS2_I(inode);
- u32 hsize, len = 0;
+ u32 hsize, len;
u32 hash, index;
__be64 *lp;
int copied = 0;
- int error = 0;
+ int error;
unsigned depth = 0;
hsize = 1 << dip->i_depth;
hash = gfs2_dir_offset2hash(*offset);
index = hash >> (32 - dip->i_depth);
- lp = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(lp))
- return PTR_ERR(lp);
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+
+ lp = dip->i_hash_cache.h_ht;
+ gfs2_dir_readahead(inode, hsize, index);
while (index < hsize) {
error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
@@ -1925,14 +2001,13 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
u32 index = 0, next_index;
__be64 *lp;
u64 leaf_no;
- int error = 0, last;
+ int error, last;
hsize = 1 << dip->i_depth;
-
- lp = gfs2_dir_get_hash_table(dip);
- if (IS_ERR(lp))
- return PTR_ERR(lp);
-
+ error = gfs2_dir_get_hash_table(dip);
+ if (error)
+ return error;
+ lp = dip->i_hash_cache.h_ht;
while (index < hsize) {
leaf_no = be64_to_cpu(lp[index]);
if (leaf_no) {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37..731d763 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -271,6 +271,11 @@ enum {
};
+struct gfs2_hash_table {
+ __be64 *h_ht; /* hash table data */
+ u8 *h_ra; /* array indicating whether hash table block was read */
+};
+
struct gfs2_inode {
struct inode i_inode;
u64 i_no_addr;
@@ -285,7 +290,7 @@ struct gfs2_inode {
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
struct list_head i_trunc_list;
- __be64 *i_hash_cache;
+ struct gfs2_hash_table i_hash_cache;
u32 i_entries;
u32 i_diskflags;
u8 i_height;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 8a139ff..a834b88 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,8 @@ static void gfs2_init_inode_once(void *foo)
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
ip->i_alloc = NULL;
- ip->i_hash_cache = NULL;
+ ip->i_hash_cache.h_ht = NULL;
+ ip->i_hash_cache.h_ra = NULL;
}
static void gfs2_init_glock_once(void *foo)
More information about the Cluster-devel
mailing list