[Cluster-devel] [GFS2] Track metadata via the pagecache

Steven Whitehouse <swhiteho@redhat.com>
Wed Dec 12 16:27:57 UTC 2007


From b9a77bfa8cfee4de5e0879e5b12b7a73f9bcc137 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 11 Dec 2007 09:51:50 +0000
Subject: [PATCH] [GFS2] Track metadata via the pagecache

Rather than relying on the fact that we have a per-inode metadata
address space, this patch changes the way we track metadata
buffers: we iterate over a range of pages and, in the invalidate
case, check that the buffers in that range match the glock we are
using. In the sync case, we short-circuit that by simply syncing
everything (all metadata) between the two limits tracked by the
glock.
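
As an aside on the arithmetic involved: block numbers are mapped
to page cache indices by the difference between the page and
block size shifts. A minimal sketch, with a 4096-byte page
(PAGE_CACHE_SHIFT of 12) and 1024-byte blocks (sb_bsize_shift of
10) assumed purely for the example:

    unsigned shift = 12 - 10;            /* log2(blocks per page) */
    u64 gl_start = 4000, gl_end = 4007;  /* block extent under the glock */
    pgoff_t start = gl_start >> shift;   /* page index 1000 */
    pgoff_t end = gl_end >> shift;       /* page index 1001 */

so the whole extent is covered by scanning just two pages.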

In the normal case, this is very efficient as all the metadata
for an inode is concentrated in a single place.

This patch requires two new symbol exports so that we can sync
and wait on a range of pages.
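
In outline, the sync half builds a range-limited
writeback_control and drives it through write_cache_pages(); the
fragment below is a condensed restatement of gfs2_meta_sync()
from the patch, not additional code:

    struct writeback_control wbc = {
        .sync_mode   = WB_SYNC_ALL,
        .nr_to_write = LONG_MAX,
        /* byte offsets: block number << block size shift */
        .range_start = gl->gl_start << sdp->sd_sb.sb_bsize_shift,
        .range_end   = gl->gl_end << sdp->sd_sb.sb_bsize_shift,
    };
    ret = write_cache_pages(mapping, &wbc, write_meta_pages, gl);

The wait half then walks the same page range and calls
wait_on_page_writeback() on each page holding this glock's
buffers (gfs2_wait_on_meta_pages via gfs2_foreach_page below).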

The reason for making this change is to prepare for removing
the per-inode metadata address space and replacing it with
a single address space for all metadata. That change is done
in the next patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a7f3c46..6ad3f6c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -346,6 +346,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_sbd = sdp;
 	gl->gl_aspace = NULL;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
+	gl->gl_start = gl->gl_end = number;
 
 	/* If this glock protects actual on-disk data or metadata blocks,
 	   create a VFS inode to manage the pages/buffers holding them. */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c663b7a..26640c1 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,8 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -70,26 +72,103 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	gfs2_log_flush(sdp, NULL);
 }
 
-/**
- * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_pte_inval(struct gfs2_glock *gl)
+static int gfs2_zap_glock_buffers(struct buffer_head *bh)
+{
+	BUG_ON(buffer_dirty(bh));
+	clear_buffer_uptodate(bh);
+	if (test_clear_buffer_glock(bh))
+		bh->b_private = NULL;
+	/* We leave any gfs2_bufdata for releasepage */
+	return 0;
+}
+
+static int gfs2_wait_on_meta_pages(struct buffer_head *bh)
 {
-	struct gfs2_inode *ip;
-	struct inode *inode;
+	wait_on_page_writeback(bh->b_page);
+	return 1;
+}
+
+static void gfs2_foreach_page(struct gfs2_glock *gl,
+			      int (*fxn)(struct buffer_head *bh))
+{
+	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	unsigned shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+	pgoff_t start = (pgoff_t)(gl->gl_start >> shift);
+	pgoff_t end = (pgoff_t)(gl->gl_end >> shift);
+	struct pagevec pvec;
+	pgoff_t next;
+
+	pagevec_init(&pvec, 0);
+	next = start;
+	while (next <= end &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		int i;
+		for(i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			lock_page(page);
+			if (likely(page_has_buffers(page))) {
+				struct buffer_head *bh, *head;
+				bh = head = page_buffers(page);
+				do {
+					if (gl == bh_to_glock(bh)) {
+						if (fxn(bh))
+							break;
+					}
+					bh = bh->b_this_page;
+				} while(bh != head);
+			}
+			unlock_page(page);
+			next++;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
 
-	ip = gl->gl_object;
-	inode = &ip->i_inode;
-	if (!ip || !S_ISREG(inode->i_mode))
-		return;
+void gfs2_meta_inval(struct gfs2_glock *gl)
+{
+	gfs2_foreach_page(gl, gfs2_zap_glock_buffers);
+	gl->gl_start = gl->gl_end = gl->gl_name.ln_number;
+}
 
-	unmap_shared_mapping_range(inode->i_mapping, 0, 0);
-	if (test_bit(GIF_SW_PAGED, &ip->i_flags))
-		set_bit(GLF_DIRTY, &gl->gl_flags);
+static int write_meta_pages(struct page *page, struct writeback_control *wbc,
+			    void *data)
+{
+	struct gfs2_glock *gl = data;
+	struct buffer_head *bh, *head;
+	int ret;
+
+	bh = head = page_buffers(page);
+	do {
+		if (gl == bh_to_glock(bh) && buffer_dirty(bh))
+			goto do_writepage;
+		bh = bh->b_this_page;
+	} while(bh != head);
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+do_writepage:
+	ret = page->mapping->a_ops->writepage(page, wbc);
+	if (ret)
+		mapping_set_error(page->mapping, ret);
+	return ret;
+}
 
+static int gfs2_meta_sync(struct gfs2_glock *gl)
+{
+	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.range_start = gl->gl_start << sdp->sd_sb.sb_bsize_shift,
+		.range_end = gl->gl_end << sdp->sd_sb.sb_bsize_shift,
+	};
+	int ret = write_cache_pages(mapping, &wbc, write_meta_pages, gl);
+	if (!ret)
+		gfs2_foreach_page(gl, gfs2_wait_on_meta_pages);
+	return ret;
 }
 
 /**
@@ -103,14 +182,10 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
 
 static void meta_go_sync(struct gfs2_glock *gl)
 {
-	if (gl->gl_state != LM_ST_EXCLUSIVE)
-		return;
-
-	if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
-		gfs2_log_flush(gl->gl_sbd, gl);
+	gfs2_log_flush(gl->gl_sbd, gl);
+	if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		gfs2_meta_sync(gl);
-		gfs2_ail_empty_gl(gl);
-	}
+	gfs2_ail_empty_gl(gl);
 }
 
 /**
@@ -122,9 +197,6 @@ static void meta_go_sync(struct gfs2_glock *gl)
 
 static void meta_go_inval(struct gfs2_glock *gl, int flags)
 {
-	if (!(flags & DIO_METADATA))
-		return;
-
 	gfs2_meta_inval(gl);
 	gl->gl_vn++;
 }
@@ -138,31 +210,21 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
 static void inode_go_sync(struct gfs2_glock *gl)
 {
 	struct gfs2_inode *ip = gl->gl_object;
-	struct address_space *metamapping = gl->gl_aspace->i_mapping;
-	int error;
 
-	if (gl->gl_state != LM_ST_UNLOCKED)
-		gfs2_pte_inval(gl);
-	if (gl->gl_state != LM_ST_EXCLUSIVE)
-		return;
-
-	if (ip && !S_ISREG(ip->i_inode.i_mode))
-		ip = NULL;
-
-	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		gfs2_log_flush(gl->gl_sbd, gl);
-		filemap_fdatawrite(metamapping);
-		if (ip) {
-			struct address_space *mapping = ip->i_inode.i_mapping;
-			filemap_fdatawrite(mapping);
-			error = filemap_fdatawait(mapping);
-			mapping_set_error(mapping, error);
+	gfs2_log_flush(gl->gl_sbd, gl);
+	if (ip && S_ISREG(ip->i_inode.i_mode)) {
+		struct inode *inode = &ip->i_inode;
+		struct address_space *mapping = inode->i_mapping;
+		int mapped = mapping->i_mmap_writable;
+		unmap_shared_mapping_range(mapping, 0, 0);
+		if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags) || mapped) {
+			generic_osync_inode(inode, mapping,
+					    OSYNC_DATA|OSYNC_METADATA);
 		}
-		error = filemap_fdatawait(metamapping);
-		mapping_set_error(metamapping, error);
-		clear_bit(GLF_DIRTY, &gl->gl_flags);
-		gfs2_ail_empty_gl(gl);
+	} else {
+		gfs2_meta_sync(gl);
 	}
+	gfs2_ail_empty_gl(gl);
 }
 
 /**
@@ -363,7 +425,6 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
 }
 
 const struct gfs2_glock_operations gfs2_meta_glops = {
-	.go_xmote_th = meta_go_sync,
 	.go_type = LM_TYPE_META,
 };
 
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index a1d9b5b..812f7fb 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -22,4 +22,6 @@ extern const struct gfs2_glock_operations gfs2_nondisk_glops;
 extern const struct gfs2_glock_operations gfs2_quota_glops;
 extern const struct gfs2_glock_operations gfs2_journal_glops;
 
+extern void gfs2_meta_inval(struct gfs2_glock *gl);
+
 #endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b2487cf..2889a43 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -249,6 +249,8 @@ struct gfs2_glock {
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
 	struct delayed_work gl_work;
+	u64 gl_start;
+	u64 gl_end;
 };
 
 struct gfs2_alloc {
@@ -274,7 +276,6 @@ struct gfs2_alloc {
 enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
-	GIF_SW_PAGED		= 3,
 };
 
 struct gfs2_dinode_host {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 53bca99..2017c5a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -369,7 +369,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 	if (error)
 		goto out_rg_gunlock;
 
-	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
 
 	gfs2_free_di(rgd, ip);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ceae8cf..58e51ef 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -311,18 +311,11 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 
 static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 {
-	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 
-	if (error) {
-		gfs2_meta_sync(ip->i_gl);
-		return;
-	}
-	if (pass != 1)
+	if (error || pass != 1)
 		return;
 
-	gfs2_meta_sync(ip->i_gl);
-
 	fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
 	        jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
 }
@@ -709,19 +702,11 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 
 static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 {
-	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 
-	if (error) {
-		gfs2_meta_sync(ip->i_gl);
-		return;
-	}
-	if (pass != 1)
+	if (error || pass != 1)
 		return;
 
-	/* data sync? */
-	gfs2_meta_sync(ip->i_gl);
-
 	fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
 		jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
 }
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 2ffee4f..cc93fca 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -88,45 +88,6 @@ void gfs2_aspace_put(struct inode *aspace)
 }
 
 /**
- * gfs2_meta_inval - Invalidate all buffers associated with a glock
- * @gl: the glock
- *
- */
-
-void gfs2_meta_inval(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct inode *aspace = gl->gl_aspace;
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
-
-	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
-
-	atomic_inc(&aspace->i_writecount);
-	truncate_inode_pages(mapping, 0);
-	atomic_dec(&aspace->i_writecount);
-
-	gfs2_assert_withdraw(sdp, !mapping->nrpages);
-}
-
-/**
- * gfs2_meta_sync - Sync all buffers associated with a glock
- * @gl: The glock
- *
- */
-
-void gfs2_meta_sync(struct gfs2_glock *gl)
-{
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
-	int error;
-
-	filemap_fdatawrite(mapping);
-	error = filemap_fdatawait(mapping);
-
-	if (error)
-		gfs2_io_error(gl->gl_sbd);
-}
-
-/**
  * getbuf - Get a buffer with a given address space
  * @gl: the glock
  * @blkno: the block number (filesystem scope)
@@ -182,6 +143,13 @@ static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 	mark_page_accessed(page);
 	page_cache_release(page);
 
+	spin_lock(&gl->gl_spin);
+	if (blkno > gl->gl_end)
+		gl->gl_end = blkno;
+	else if (blkno < gl->gl_start)
+		gl->gl_start = blkno;
+	spin_unlock(&gl->gl_spin);
+
 	return bh;
 }
 
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index dba789c..e276853 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -40,9 +40,6 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
 struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
 void gfs2_aspace_put(struct inode *aspace);
 
-void gfs2_meta_inval(struct gfs2_glock *gl);
-void gfs2_meta_sync(struct gfs2_glock *gl);
-
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
 		   int flags, struct buffer_head **bhp);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 597f7ff..f3cc02b 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -359,7 +359,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	if (ret)
 		goto out;
 
-	set_bit(GIF_SW_PAGED, &ip->i_flags);
 	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
 	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
 	if (ret || !alloc_required)
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 27c994f..2658f6c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -522,6 +522,12 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 			error = foreach_descriptor(jd, head.lh_tail,
 						   head.lh_blkno, pass);
 			lops_after_scan(jd, error, pass);
+			if (error || pass == 1) {
+				generic_osync_inode(jd->jd_inode,
+						    jd->jd_inode->i_mapping,
+						    OSYNC_DATA|OSYNC_METADATA);
+				gfs2_meta_inval(GFS2_I(jd->jd_inode)->i_gl);
+			}
 			if (error)
 				goto fail_gunlock_tr;
 		}
-- 
1.5.1.2