[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

[Cluster-devel] Compilation problem with GFS/GNBD and kernel panics on stress.



Hello all,

I'm trying to use GFS with Fedora Core 4. It was upgraded to a kernel 2.6.16-1.2111_FC4smp. RPM versions are:
GFS-kernel-smp-2.6.11.8-20050601.152643.FC4.25
GFS-6.1.0-3
GFS-kernheaders-2.6.11.8-20050601.152643.FC4.25
dlm-kernheaders-2.6.11.5-20050601.152643.FC4.22
dlm-kernel-smp-2.6.11.5-20050601.152643.FC4.22
dlm-1.0.0-3
gnbd-kernheaders-2.6.11.2-20050420.133124.FC4.58
gnbd-1.0.0-1


There were problems installing the following packages, and the following patches were necessary:

-GFS-kernel


--- gfs-kernel-2.6.11.8-20050601.152643.FC4/src/gfs/ops_file.c.orig	2006-06-01 13:57:58.000000000 +0200
+++ gfs-kernel-2.6.11.8-20050601.152643.FC4/src/gfs/ops_file.c	2006-06-01 13:57:24.000000000 +0200
@@ -931,12 +931,12 @@
	if (!access_ok(VERIFY_READ, buf, size))
		return -EFAULT;

-	down(&inode->i_sem);
+	mutex_lock(&inode->i_mutex);
	if (file->f_flags & O_DIRECT)
		count = walk_vm(file, (char *)buf, size, offset, do_write_direct);
	else
		count = walk_vm(file, (char *)buf, size, offset, do_write_buf);
-	up(&inode->i_sem);
+	mutex_unlock(&inode->i_mutex);

	return count;
}
--- gfs-kernel-2.6.11.8-20050601.152643.FC4/src/gfs/ops_fstype.c.orig	2006-06-01 14:04:16.000000000 +0200
+++ gfs-kernel-2.6.11.8-20050601.152643.FC4/src/gfs/ops_fstype.c	2006-06-01 14:05:29.000000000 +0200
@@ -712,12 +712,12 @@
		goto out;
	} else {
		char buf[BDEVNAME_SIZE];
-
+		unsigned long bsize;
		sb->s_flags = flags;
		strlcpy(sb->s_id, bdevname(real, buf), sizeof(sb->s_id));
-		sb->s_old_blocksize = block_size(real);
-		sb_set_blocksize(sb, sb->s_old_blocksize);
-		set_blocksize(real, sb->s_old_blocksize);
+		bsize = block_size(real);
+		sb_set_blocksize(sb, bsize);
+		set_blocksize(real, bsize);
		error = fill_super(sb, data, (flags & MS_VERBOSE) ? 1 : 0);
		if (error) {
			up_write(&sb->s_umount);
@@ -748,7 +748,7 @@
{
	struct block_device *diaper = sb->s_bdev;
	struct block_device *real = gfs_diaper_2real(diaper);
-	unsigned long bsize = sb->s_old_blocksize;
+	unsigned long bsize = block_size(real);

	generic_shutdown_super(sb);
	set_blocksize(diaper, bsize);



I am quite confident about "ops_file.c" as it looks like the latest version for 2.6.15:
http://sources.redhat.com/cgi-bin/cvsweb.cgi/cluster/gfs-kernel/src/gfs/ops_file.c?rev=1.16.6.2.2.4&content-type=text/x-cvsweb-markup&cvsroot=cluster&only_with_tag=gfs-kernel_2_6_15_2

For "ops_fstype.c", it should be ok, unless you see obvious errors.


- gnbd-kernel:

--- gnbd-kernel-2.6.11.2-20050420.133124/src/gnbd.c.orig	2006-06-01 13:46:35.000000000 +0200
+++ gnbd-kernel-2.6.11.2-20050420.133124/src/gnbd.c	2006-06-01 13:47:03.000000000 +0200
@@ -180,9 +180,9 @@
	set_capacity(dev->disk, size);
	bdev = bdget_disk(dev->disk, 0);
	if (bdev) {
-		down(&bdev->bd_inode->i_sem);
+		mutex_lock(&bdev->bd_inode->i_mutex);
		i_size_write(bdev->bd_inode, (loff_t)size << 9);
-		up(&bdev->bd_inode->i_sem);
+		mutex_unlock(&bdev->bd_inode->i_mutex);
		bdput(bdev);
	}
	up(&dev->do_it_lock);
@@ -281,7 +281,7 @@
	
	spin_lock_irqsave(q->queue_lock, flags);
	if (!end_that_request_first(req, uptodate, req->nr_sectors)) {
-		end_that_request_last(req);
+		end_that_request_last(req, 0);
	}
	spin_unlock_irqrestore(q->queue_lock, flags);
}


This one is quite straightforward.


Once compiled and run, I get 1 node running GNBD and exporting one of its disks. 3 other nodes are running as clients for GNBD, and I mount a GFS on them, although all 4 nodes participate in a GFS cluster. (standard config: dlm, cman)

I have tried to loop 100 times over parallel "bonnie++" on the 3 nodes, with:
bonnie++ -u 0:0 -d /mnt/gfs -x 100

One of the nodes crashed before the end, during the 10th loop, with the following panic:

Unable to handle kernel paging request at 0000000000200220 RIP:
^M<ffffffff88351d6a>{:gfs:gfs_depend_add+430}
^MPGD 306d7067 PUD 37532067 PMD 0
^MOops: 0000 [1] SMP
^Mlast sysfs file: /class/gnbd/gnbd0/waittime
^MCPU 1
^MModules linked in: gnbd(U) lock_dlm(U) dlm(U) gfs(U) lock_harness(U) cman(U)
ipv6 parport_pc lp parport autofs4 rfcomm l2cap bluetooth sunrpc pcmcia yent
a_socket rsrc_nonstatic pcmcia_core dm_mod video button battery ac uhci_hcd
ehci_hcd i2c_i801 i2c_core tg3 e1000 ext3 jbd ata_piix libata sd_mod scsi_mod
^MPid: 5679, comm: bonnie++ Tainted: GF     2.6.16-1.2111_FC4smp #1
^MRIP: 0010:[<ffffffff88351d6a>] <ffffffff88351d6a>{:gfs:gfs_depend_add+430}
^MRSP: 0018:ffff81002bfddb38  EFLAGS: 00010206
^MRAX: ffff810037571200 RBX: 0000000000003a98 RCX: 0000000000000002
^MRDX: ffff810037571338 RSI: ffff81002bfddb08 RDI: ffff810001dd5c40
^MRBP: ffffc2001017a000 R08: ffffc2001017c650 R09: 0000000000000040
^MR10: 0000000000000040 R11: 0000000000040000 R12: 0000000000003a98
^MR13: 00000001002ac770 R14: 00000000002001f0 R15: ffffc2001017a258
^MFS:  00002aaaaaab8380(0000) GS:ffff8100021d9f40(0000) knlGS:0000000000000000
^MCS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
^MCR2: 0000000000200220 CR3: 0000000035b0b000 CR4: 00000000000006e0
^MProcess bonnie++ (pid: 5679, threadinfo ffff81002bfdc000, task ffff81003ecd5860)
^MStack: ffff810037571200 000000018832af2b 0000000000d633e7 ffff810006d384a8
^M       ffff810022a0d978 0000000000d633e8 ffffc2001017a000 0000000000000001
^M       ffff810009bd4490 ffffffff8832b99b
^MCall Trace: <ffffffff8832b99b>{:gfs:gfs_wipe_buffers+842}
^M       <ffffffff8833a292>{:gfs:gfs_inode_dealloc+1023}
<ffffffff88356102>{:gfs:gfs_unlinked_limit+230}
^M       <ffffffff8834aaac>{:gfs:gfs_unlink+60}
<ffffffff8834b183>{:gfs:gfs_permission+483}
^M       <ffffffff8019080f>{permission+114} <ffffffff80190a39>{vfs_unlink+203}
^M       <ffffffff8019312d>{do_unlinkat+184}
<ffffffff8010d431>{syscall_trace_enter+181}
^M       <ffffffff8010ab11>{tracesys+113} <ffffffff8010ab71>{tracesys+209}

^MCode: 4d 8b 66 30 4c 89 ff e8 34 04 00 f8 8b 9d 94 02 00 00 4c 89
^MRIP <ffffffff88351d6a>{:gfs:gfs_depend_add+430} RSP <ffff81002bfddb38>
^MCR2: 0000000000200220
^M <0>Kernel panic - not syncing: Oops

^MCall Trace: <ffffffff80134f76>{panic+133}
<ffffffff803521fb>{_spin_unlock_irqrestore+11}
^M       <ffffffff8035293c>{oops_end+71} <ffffffff803543ba>{do_page_fault+1770}
^M       <ffffffff8017dfc1>{kmem_freepages+191} <ffffffff8017e2e7>{slab_destroy+151}
^M       <ffffffff8010b93d>{error_exit+0}
<ffffffff88351d6a>{:gfs:gfs_depend_add+430}
^M       <ffffffff88351da4>{:gfs:gfs_depend_add+488}
<ffffffff8832b99b>{:gfs:gfs_wipe_buffers+842}
^M       <ffffffff8833a292>{:gfs:gfs_inode_dealloc+1023}
<ffffffff88356102>{:gfs:gfs_unlinked_limit+230}
^M       <ffffffff8834aaac>{:gfs:gfs_unlink+60}
<ffffffff8834b183>{:gfs:gfs_permission+483}
^M       <ffffffff8019080f>{permission+114} <ffffffff80190a39>{vfs_unlink+203}
^M       <ffffffff8019312d>{do_unlinkat+184}
<ffffffff8010d431>{syscall_trace_enter+181}
^M       <ffffffff8010ab11>{tracesys+113} <ffffffff8010ab71>{tracesys+209}


Any thoughts on this ? Maybe it has already been corrected in a more recent version ?

--
Mathieu Avila


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]