[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
jbrassow at sourceware.org
jbrassow at sourceware.org
Thu Apr 5 21:32:27 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: jbrassow at sourceware.org 2007-04-05 22:32:26
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c
Log message:
Bug 234918 Processed: NMI Watchdog detected LOCKUP while running proces...
Bug 217438: scrolling kernel requests to mark mirror regions
Item 1:
I needed to check for marked regions when getting resync work, not
just check for resyncing regions when a mark/flush happens.
Item 2:
There is a corner case that allows two calls to clear the same
region. The second does not need to be logged.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.43&r2=1.1.2.44
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.29&r2=1.1.2.30
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/03 18:21:10 1.1.2.43
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/05 21:32:25 1.1.2.44
@@ -1034,7 +1034,9 @@
spin_lock(&lc->state_lock);
- /* Should find match in this list, or no lists at all */
+ /*
+ * The nominal case is to find the region in the marked list
+ */
list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
if(region == rs->rs_region){
list_del_init(&rs->rs_list);
@@ -1043,28 +1045,46 @@
}
}
-
- list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+ /*
+ * It is possible, but unlikely to get to this case. It requires
+ * the following to happen:
+ * 1) mark the region for writing
+ * 2) clear the region
+ * 3) clear doesn't get flushed because of bug 235040
+ * 4) suspend due to server relocation
+ * 5) on-disk log says we need to recover (because it hasn't been cleared)
+ * 6) we recover the region
+ * 7) clearing the region after recovery causes us to get here
+ *
+ * Once 235040 is cleared, any entries found in this list should
+ * cause a bug.
+ */
+ list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
if(region == rs->rs_region){
- DMERR("Clear pre-empting mark (%Lu/%s)",
- region, lc->uuid + (strlen(lc->uuid) - 8));
- BUG();
+ DMERR("%d) Double clear on region ("
+ SECTOR_FORMAT ")", __LINE__, region);
+ goto out;
}
}
- list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+ list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
if(region == rs->rs_region){
- DMERR("%d) Double clear on region ("
- SECTOR_FORMAT ")", __LINE__, region);
+ DMERR("Clear pre-empting mark (%Lu/%s)",
+ region, lc->uuid + (strlen(lc->uuid) - 8));
BUG();
}
}
+
/* We can get here because we may be doing resync_work, and therefore,**
** clearing without ever marking..................................... */
/* Don't need to spin_unlock, because allocation is non-blocking */
rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
- BUG_ON(!rs_new);
+ if (!rs_new) {
+ DMERR("Failed to allocate space for clear region request: %Lu",
+ region);
+ BUG();
+ }
memset(rs_new, 0, sizeof(struct region_state));
rs_new->rs_region = region;
@@ -1088,6 +1108,21 @@
DMWARN("Error while getting resync work: bad region");
rtn = 0;
}
+
+ /*
+ * Check for bug 235039
+ * Note the changes in cluster_clear_region
+ */
+ if (rtn == 1) {
+ struct region_state *rs, *tmp_rs;
+ list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+ if (*region == rs->rs_region) {
+ DMERR("WARNING: Bug 235039/235040 detected!");
+ DMERR("Work-around in place.");
+ }
+ }
+ }
+
return rtn;
}
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/04 21:35:23 1.1.2.29
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/05 21:32:25 1.1.2.30
@@ -656,6 +656,8 @@
static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+ struct region_user *ru;
+
if (lr->u.lr_region > lc->region_count) {
return -EINVAL;
}
@@ -678,6 +680,42 @@
DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+ ru = find_ru_by_region(lc, lr->u.lr_region);
+
+ /*
+ * The following condition can never happen unless we have
+ * a corrupted list or we had a communication error.
+ *
+ * If a write failed to one of the mirror devices, the ru
+ * should be RU_WRITE. If a recovery failed, it should be
+ * RU_RECOVER
+ */
+ if (!ru) {
+ DMERR("Unable to find region being marked out-of-sync: %Lu",
+ lr->u.lr_region);
+ return -EINVAL;
+ }
+
+ if (ru->ru_rw == RU_RECOVER) {
+ if (lr->u.lr_region != lc->recovering_region) {
+ DMERR("Recovering region mismatch: (%Lu/%Lu)",
+ lr->u.lr_region, lc->recovering_region);
+ BUG();
+ }
+ /*
+ * Clear the recovery
+ */
+ lc->recovering_region = (uint64_t)-1;
+ list_del(&ru->ru_list);
+ mempool_free(ru, region_user_pool);
+ } else { /* ru->ru_rw == RU_WRITE */
+ /*
+ * Mirror has placed the region into RH_NOSYNC
+ * It is safe to pull the ru
+ */
+ list_del(&ru->ru_list);
+ mempool_free(ru, region_user_pool);
+ }
/* gone again: lc->sync_count--;*/
log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
}
More information about the Cluster-devel
mailing list