[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
jbrassow at sourceware.org
jbrassow at sourceware.org
Tue Sep 5 17:50:12 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: jbrassow at sourceware.org 2006-09-05 17:50:11
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c
Log message:
- Fix the bugs I've seen so far that cause hangs during combinations
of create/delete/convert of mirrors. Most of them are related to the
recently added ability to migrate the log server on suspension.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.23&r2=1.1.2.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.16&r2=1.1.2.17
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2006/07/27 23:11:55 1.1.2.23
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2006/09/05 17:50:11 1.1.2.24
@@ -396,8 +396,8 @@
set_fs(get_ds());
if(type == LRT_MASTER_LEAVING){
- len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
- /* WAIT for it */0);
+ len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+ 0, 10);
} else {
len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
0, 5);
@@ -419,7 +419,7 @@
goto fail;
}
- if(lr->u.lr_int_rtn == -ENXIO){
+ if (lr->u.lr_int_rtn == -ENXIO) {
lc->server_id = 0xDEAD;
*retry = 1;
goto fail;
@@ -591,7 +591,7 @@
unsigned int argc, char **argv, int disk)
{
int error = 0;
- struct log_c *lc;
+ struct log_c *lc, *tmp_lc;
struct sockaddr_in saddr_in;
if (!disk) {
@@ -621,6 +621,15 @@
atomic_set(&lc->in_sync, -1);
+ list_for_each_entry(tmp_lc, &log_list_head, log_list){
+ if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
+ DMERR("Log already exists with uuid, %s",
+ lc->uuid + (strlen(lc->uuid) - 8));
+ error = -EINVAL;
+ goto fail;
+ }
+ }
+
list_add(&lc->log_list, &log_list_head);
INIT_LIST_HEAD(&lc->region_users);
@@ -730,6 +739,7 @@
list_del_init(&lc->log_list);
if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
sock_release(lc->client_sock);
if (lc->log_dev)
@@ -748,6 +758,7 @@
static int cluster_postsuspend(struct dirty_log *log)
{
+ int r;
struct log_c *lc = (struct log_c *) log->context;
while (1) {
@@ -765,12 +776,16 @@
if(lc->server_id == my_id) {
while (1) {
consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
down(&consult_server_lock);
run_election(lc, 0xDEAD);
up(&consult_server_lock);
- if (lc->server_id == my_id) {
+
+ if ((my_id && (lc->server_id == my_id))) {
+ atomic_set(&lc->suspended, 0);
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ/4);
+ schedule_timeout(HZ*2);
+ atomic_set(&lc->suspended, 1);
} else {
break;
}
@@ -1005,7 +1020,7 @@
if (!success) {
DMERR("Attempting to revert sync status of region #%llu", region);
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ/50);
+ schedule_timeout(HZ/5);
}
return;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2006/07/27 23:11:55 1.1.2.16
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2006/09/05 17:50:11 1.1.2.17
@@ -107,7 +107,9 @@
if (!log->log_dev)
return 0;
- BUG_ON(atomic_read(&log->suspended));
+ if (atomic_read(&log->suspended))
+ return -EDEADLK;
+
r = dm_io_sync_vm(1, &log->header_location, READ,
log->disk_header, &ebits);
if (unlikely(r))
@@ -138,7 +140,9 @@
if (!log->log_dev)
return 0;
- BUG_ON(atomic_read(&log->suspended));
+ if (atomic_read(&log->suspended))
+ return -EDEADLK;
+
header_to_disk(&log->header, log->disk_header);
return dm_io_sync_vm(1, &log->header_location, WRITE,
log->disk_header, &ebits);
@@ -182,7 +186,9 @@
if (!log->log_dev)
return 0;
- BUG_ON(atomic_read(&log->suspended));
+ if (atomic_read(&log->suspended))
+ return -EDEADLK;
+
r = dm_io_sync_vm(1, &log->bits_location, READ,
log->clean_bits, &ebits);
@@ -199,7 +205,9 @@
if (!log->log_dev)
return 0;
- BUG_ON(atomic_read(&log->suspended));
+ if (atomic_read(&log->suspended))
+ return -EDEADLK;
+
return dm_io_sync_vm(1, &log->bits_location, WRITE,
log->clean_bits, &ebits);
}
@@ -252,9 +260,9 @@
continue;
} else if(str[i] == 0xFF){
if(range_count==1){
- DMINFO(" %d", region - 1);
+ DMDEBUG(" %d", region - 1);
} else if(range_count){
- DMINFO(" %d - %d", region-range_count, region-1);
+ DMDEBUG(" %d - %d", region-range_count, region-1);
}
range_count = 0;
region+=(bit_count < 8)? bit_count: 8;
@@ -272,9 +280,9 @@
count++;
} else {
if(range_count==1){
- DMINFO(" %d", region - 1);
+ DMDEBUG(" %d", region - 1);
} else if(range_count){
- DMINFO(" %d - %d", region-range_count, region-1);
+ DMDEBUG(" %d - %d", region-range_count, region-1);
}
range_count = 0;
region++;
@@ -283,9 +291,9 @@
}
if(range_count==1){
- DMINFO(" %d", region - 1);
+ DMDEBUG(" %d", region - 1);
} else if(range_count){
- DMINFO(" %d - %d", region-range_count, region);
+ DMDEBUG(" %d - %d", region-range_count, region);
}
return count;
}
@@ -312,7 +320,7 @@
i = 1;
if (!lc->log_dev_failed &&
((r = read_header(lc)) || (i = 0) || (r = read_bits(lc)))) {
- if (r == -EINVAL)
+ if (r == -EINVAL || r == -EDEADLK)
return r;
DMWARN("Read %s failed on mirror log device, %s",
@@ -416,9 +424,11 @@
i = 1;
if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) {
- DMWARN("Write %s failed on mirror log device, %s.",
- i ? "bits" : "header", lc->log_dev->name);
- lc->log_dev_failed = 1;
+ if (r != -EDEADLK) {
+ DMWARN("Write %s failed on mirror log device, %s.",
+ i ? "bits" : "header", lc->log_dev->name);
+ lc->log_dev_failed = 1;
+ }
} else
lc->log_dev_failed = 0;
@@ -469,6 +479,11 @@
static int server_in_sync(struct log_c *lc, struct log_request *lr)
{
+ if (lr->u.lr_region > lc->region_count) {
+ lr->u.lr_int_rtn = 0;
+ return -EINVAL;
+ }
+
if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
/* in-sync */
lr->u.lr_int_rtn = 1;
@@ -581,6 +596,11 @@
static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
uint32_t info;
+
+ if (lr->u.lr_region > lc->region_count) {
+ return -EINVAL;
+ }
+
log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
if (success) {
@@ -678,15 +698,16 @@
/*
* Check if we have access to the log. We may not
- * get have loaded this device.
+ * yet have loaded this device.
*/
- if(!lc){
+ if (!lc) {
lr->u.lr_node_count++;
return 0;
}
if(lr->lr_type == LRT_MASTER_LEAVING){
- lc->server_id = 0xDEAD;
+ if (lr->u.lr_starter == lc->server_id)
+ lc->server_id = 0xDEAD;
lr->u.lr_node_count++;
return 0;
}
@@ -696,7 +717,7 @@
* We shortcut the election here and respond directly
* to the inquirer
*/
- if(lc->server_id == my_id){
+ if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
lr->u.lr_coordinator = my_id;
if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
return -1;
@@ -850,10 +871,12 @@
if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
DMDEBUG("I'm the cluster mirror log server for %s",
lc->uuid + (strlen(lc->uuid) - 8));
- if (!atomic_read(&lc->suspended))
- disk_resume(lc);
- else
+ if (atomic_read(&lc->suspended)) {
DMDEBUG("Not reading disk log because I'm suspended.");
+
+ } else if (disk_resume(lc) == -EDEADLK) {
+ DMDEBUG("Unable to read disk log - deadlock potential.");
+ }
}
goto reply;
}
@@ -944,7 +967,7 @@
/*
DMWARN("Error (%d) while processing request (%s)",
error,
- (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C LEAN":
+ (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
@@ -972,7 +995,18 @@
set_fs(fs);
if(error < 0){
- DMWARN("unable to sendmsg to client (error = %d)", error);
+ DMWARN("unable to sendmsg to client (type = %s, error = %d)",
+ (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+ (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+ (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+ (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+ (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+ (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+ (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+ (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+ (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+ (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+ error);
return error;
}
} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1036,10 +1070,11 @@
list_for_each_entry(lc, &log_list_head, log_list){
if(lc->server_id == my_id){
- if (!atomic_read(&lc->suspended))
- disk_resume(lc);
- else
+ if (atomic_read(&lc->suspended)) {
DMDEBUG("Not reading disk log because I'm suspended.");
+ } else if (disk_resume(lc) == -EDEADLK) {
+ DMDEBUG("Unable to read disk log - deadlock potential.");
+ }
}
}
break;
More information about the Cluster-devel
mailing list