[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
jbrassow at sourceware.org
jbrassow at sourceware.org
Tue Apr 10 07:13:18 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL45
Changes by: jbrassow at sourceware.org 2007-04-10 08:13:15
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h
dm-cmirror-server.c dm-cmirror-server.h
Log message:
Bug 235686: Kernel BUG at dm_cmirror_server while recovering region
Several fixes have gone in to fix the handling of this bug:
1) During server relocation (which can happen due to machine failure
or normal mirror suspension), the server value could get set before
the client had a chance to clean-up. This caused the server to
become confused and issue a BUG().
2) perform a flush of the log before suspending. This ensures
that regions which are in-sync get correctly flushed to the disk
log. Without this, there will always be recovery work to be done
when a mirror starts up - even if it was properly in-sync during
shutdown.
3) clean-up memory used to record region users when a mirror is
shutdown. It was possible for some regions to be left over
(causing a memory leak) during certain fault scenarios.
4) properly initialize the state field (ru_rw) in the region
user structure when a mark occurs. Without the initialization,
it was sometimes possible for the region to be misinterpreted
as recovering instead of marked.
5) resolve an unhandled case in server_complete_resync_work
6) reset a variable in cluster_complete_resync_work. Failure
to do so was causing a retry to include the wrong value for
the completion of the resync work - confusing the server.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.3&r2=1.1.2.41.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.12.2.1&r2=1.1.2.12.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.4&r2=1.1.2.26.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2&r2=1.1.2.2.8.1
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/05 21:33:36 1.1.2.41.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/10 07:13:15 1.1.2.41.2.4
@@ -373,13 +373,8 @@
fs = get_fs();
set_fs(get_ds());
- if(type == LRT_MASTER_LEAVING){
- len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
- 0, 10);
- } else {
- len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
- 0, 5);
- }
+ len = my_recvmsg(lc->client_sock, &msg,
+ sizeof(struct log_request), 0, 15);
set_fs(fs);
if(len <= 0){
@@ -492,9 +487,14 @@
goto out;
}
election:
- while(lc->server_id == 0xDEAD){
+ while ((lc->server_id == 0xDEAD) || (!lc->server_valid)) {
+ DMDEBUG("server_id=%x, server_valid=%u, %s",
+ lc->server_id, lc->server_valid,
+ lc->uuid + (strlen(lc->uuid) - 8));
+ DMDEBUG("trigger = %s", RQ_STRING(type));
run_election(lc, my_id);
new_server = 1;
+ lc->server_valid = 1;
}
spin_lock(&lc->state_lock);
@@ -600,6 +600,7 @@
INIT_LIST_HEAD(&lc->mark_logged);
spin_lock_init(&lc->state_lock);
+ lc->server_valid = 0;
lc->server_id = 0xDEAD;
if ((error = cluster_connect())) {
@@ -731,19 +732,20 @@
}
if (!list_empty(&lc->mark_logged)) {
- DMERR("Mark requests remain at cluster log deactivation");
- /*
- * Should I BUG() this?
- * No. In the worst case, they will get cleaned up later
- */
- }
- list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
- list_del_init(&rs->rs_list);
- mempool_free(rs, region_state_pool);
+ int i = 0;
+
+ list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+ i++;
+ list_del_init(&rs->rs_list);
+ mempool_free(rs, region_state_pool);
+ }
+ DMDEBUG("%d mark requests remain at cluster log deactivation", i);
}
spin_unlock(&lc->state_lock);
+ server_free_region_users(lc);
+
if (lc->log_dev)
disk_dtr(log);
else
@@ -753,8 +755,13 @@
DMERR("Unable to disconnect from cluster infrastructure.\n");
}
+static int cluster_flush(struct dirty_log *log);
static int cluster_presuspend(struct dirty_log *log)
{
+ /* FIXME: flush is work-around for bug 235040 */
+ DMDEBUG("Performing flush to work around bug 235040");
+ cluster_flush(log);
+ DMDEBUG("Log flush complete");
return 0;
}
@@ -784,6 +791,7 @@
atomic_set(&lc->suspended, 1);
if(lc->server_id == my_id) {
while (1) {
+ DMDEBUG("Telling everyone I'm suspending");
consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
down(&consult_server_lock);
@@ -791,11 +799,13 @@
up(&consult_server_lock);
if ((my_id && (lc->server_id == my_id))) {
+ DMDEBUG("Delaying suspend, work to be done.");
atomic_set(&lc->suspended, 0);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ*2);
atomic_set(&lc->suspended, 1);
} else {
+ DMDEBUG("Suspending now");
break;
}
}
@@ -907,7 +917,7 @@
DMDEBUG("Delaying mark to region %Lu, due to recovery",
rs->rs_region);
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ/2);
+ schedule_timeout(HZ);
continue;
}
@@ -933,7 +943,7 @@
if (r == -EBUSY) {
DMDEBUG("Delaying flush due to recovery");
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ/2);
+ schedule_timeout(HZ);
continue;
}
@@ -1109,6 +1119,10 @@
rtn = 0;
}
+ if (rtn)
+ DMDEBUG("Client received resync work: %Lu/%s",
+ *region, lc->uuid + (strlen(lc->uuid) - 8));
+
/*
* Check for bug 235039
* Note the changes in cluser_clear_region
@@ -1129,10 +1143,16 @@
static void cluster_complete_resync_work(struct dirty_log *log,
region_t region, int success)
{
+ int i;
region_t success_tmp = success;
struct log_c *lc = (struct log_c *) log->context;
- while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
- DMWARN("unable to notify server of completed resync work");
+
+ for (i = 0; i < 5; i++) {
+ if (!consult_server(lc, region,
+ LRT_COMPLETE_RESYNC_WORK, &success_tmp))
+ break;
+ success_tmp = success;
+ DMWARN("Unable to notify server of completed resync work");
}
return;
}
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/04/03 18:23:01 1.1.2.12.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h 2007/04/10 07:13:15 1.1.2.12.2.2
@@ -141,6 +141,7 @@
struct list_head mark_waiting;
struct list_head mark_logged;
+ uint32_t server_valid;
uint32_t server_id;
struct socket *client_sock;
};
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/05 21:33:36 1.1.2.26.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/10 07:13:15 1.1.2.26.2.5
@@ -229,7 +229,8 @@
int sync_search, conflict = 0;
if (lc->recovering_region != (uint64_t)-1) {
- DMDEBUG("Someone is already recovering region %Lu", lc->recovering_region);
+ DMDEBUG("Someone is already recovering region %Lu/%s",
+ lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
return 0;
}
@@ -253,8 +254,8 @@
sync_search);
if (find_ru_by_region(lc, *region)) {
conflict = 1;
- DMDEBUG("Recovery blocked by outstanding write on region %Lu",
- *region);
+ DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+ *region, lc->uuid + (strlen(lc->uuid) - 8));
} else {
break;
}
@@ -534,7 +535,8 @@
new->ru_nodeid = who;
new->ru_region = lr->u.lr_region;
-
+ new->ru_rw = RU_WRITE;
+
if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
list_add(&new->ru_list, &lc->region_users);
@@ -615,6 +617,11 @@
DMDEBUG(" %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
+ /* FIXME: work-around for bug 235040 */
+ DMDEBUG("Revoking resync work");
+ lc->recovering_region = (uint64_t)-1;
+ list_del(&ru->ru_list);
+ mempool_free(ru, region_user_pool);
return -EBUSY;
}
}
@@ -646,7 +653,8 @@
new->ru_region = lr->u.lr_region_rtn;
new->ru_rw = RU_RECOVER;
list_add(&new->ru_list, &lc->region_users);
- DMDEBUG("Assigning recovery work to %u: %Lu", who, new->ru_region);
+ DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+ who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
} else {
mempool_free(new, region_user_pool);
}
@@ -654,8 +662,8 @@
return 0;
}
-
-static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+static int server_complete_resync_work(struct log_c *lc, struct log_request *lr,
+ int success, uint32_t who){
struct region_user *ru;
if (lr->u.lr_region > lc->region_count) {
@@ -679,51 +687,61 @@
lc->sync_pass = 0;
DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
- } else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
- ru = find_ru_by_region(lc, lr->u.lr_region);
+ return 0;
+ }
- /*
- * The following condition can never happen unless we have
- * a corrupted list or we had a communication error.
- *
- * If a write failed to one of the mirror devices, the ru
- * should be RU_WRITE. If a recovery failed, it should be
- * RU_RECOVER
- */
- if (!ru) {
- DMERR("Unable to find region being marked out-of-sync: %Lu",
- lr->u.lr_region);
- return -EINVAL;
- }
+ /*
+ * Recovery failed or mirror is being marked out-of-sync
+ *
+ * We can recieve multiple calls to mark out-of-sync
+ * if there were several writes to the same region that
+ * failed. In this case, there will not be a record for
+ * the region.
+ */
+ ru = find_ru(lc, who, lr->u.lr_region);
- if (ru->ru_rw == RU_RECOVER) {
- if (lr->u.lr_region != lc->recovering_region) {
- DMERR("Recovering region mismatch: (%Lu/%Lu)",
- lr->u.lr_region, lc->recovering_region);
- BUG();
- }
- /*
- * Clear the recovery
- */
- lc->recovering_region = (uint64_t)-1;
- list_del(&ru->ru_list);
- mempool_free(ru, region_user_pool);
- } else { /* ru->ru_rw == RU_WRITE */
- /*
- * Mirror has place the region into RH_NOSYNC
- * It is safe to pull the ru
- */
- list_del(&ru->ru_list);
- mempool_free(ru, region_user_pool);
+ if ((lr->u.lr_region == lc->recovering_region) && !ru) {
+ DMERR("Unable to locate record of recovery");
+ BUG();
+ }
+
+ if (!ru) {
+ DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+ lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+ BUG();
+ } else if (ru->ru_rw == RU_RECOVER) {
+ if (lr->u.lr_region != lc->recovering_region) {
+ DMERR("Recovering region mismatch from node %u: (%Lu/%Lu)",
+ who, lr->u.lr_region, lc->recovering_region);
+ BUG();
}
- /* gone again: lc->sync_count--;*/
- log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+ /*
+ * Clear the recovery
+ */
+ DMDEBUG("Setting recovering region out-of-sync: %Lu/%s/%u",
+ lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+ lc->recovering_region = (uint64_t)-1;
+ list_del(&ru->ru_list);
+ mempool_free(ru, region_user_pool);
}
+ /* else if (ru->ru_rw == RU_WRITE)
+ * Mirror has place the region into RH_NOSYNC
+ * We will leave the record in place. It is
+ * likely there are more requests coming to
+ * mark this region out-of-sync, due to the
+ * way mirror handles the situation.
+ *
+ DMDEBUG("Setting marked region out-of-sync: %Lu/%s",
+ lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+ */
+
+ /* gone again: lc->sync_count--;*/
+ log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+
return 0;
}
-
static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
if (lc->sync_count > lc->region_count) {
DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
@@ -901,6 +919,7 @@
* If we are the server, assign it
*/
if(lr->u.lr_coordinator == my_id){
+ lc->server_valid = 0;
lc->server_id = my_id;
}
@@ -988,7 +1007,8 @@
DMDEBUG("I'm the cluster mirror log server for %s",
lc->uuid + (strlen(lc->uuid) - 8));
if (atomic_read(&lc->suspended)) {
- DMDEBUG("Not reading disk log because I'm suspended.");
+ DMDEBUG("Not reading disk log because I'm suspended (%s)",
+ lc->uuid + (strlen(lc->uuid) - 8));
} else if (disk_resume(lc) == -EDEADLK) {
DMDEBUG("Unable to read disk log - deadlock potential.");
@@ -1060,7 +1080,12 @@
error = server_get_resync_work(lc, lr, nodeid);
break;
case LRT_COMPLETE_RESYNC_WORK:
- error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+ if(!(nodeid =
+ ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+ error = -ENXIO;
+ break;
+ }
+ error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn, nodeid);
lr->u.lr_int_rtn = 0;
break;
case LRT_GET_SYNC_COUNT:
@@ -1236,6 +1261,22 @@
}
*/
+int server_free_region_users(struct log_c *lc)
+{
+ int i = 0;
+ struct region_user *ru, *tmp_ru;
+
+ list_for_each_entry_safe(ru, tmp_ru, &lc->region_users, ru_list) {
+ i++;
+ list_del(&ru->ru_list);
+ mempool_free(ru, region_user_pool);
+ }
+
+ DMDEBUG("%d region user structures freed", i);
+ return 0;
+}
+
+
int suspend_server(void){
atomic_set(&_suspend, 1);
return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2005/08/11 18:26:19 1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2007/04/10 07:13:15 1.1.2.2.8.1
@@ -13,5 +13,6 @@
int start_server(void);
void stop_server(void);
void print_server_status(struct log_c *lc);
+int server_free_region_users(struct log_c *lc);
#endif /* __DM_CMIRROR_SERVER_H__ */
More information about the Cluster-devel
mailing list