[Linux-cluster] Cluster logging issues + rgmanager doesn't notice failed vms

Bart Verwilst lists at verwilst.be
Mon Aug 20 12:28:42 UTC 2012


Not sure if it will help, but here is some more debugging output for 
the locking:
vm02 is the bad node, vm03 can reach the mounts fine.

root at vm02-test:~# cat /sys/kernel/debug/gfs2/kvm\:qemu/glocks
G:  s:UN n:2/19 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/183f3 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:19/99315 t:8 f:0x00 d:0x00000201 s:768
G:  s:EX n:9/1 f:Iqb t:EX d:EX/0 a:0 v:0 r:2 m:50
  H: s:EX f:eH e:0 p:8153 [(ended)] gfs2_glock_nq_num+0x5a/0xa0 [gfs2]
G:  s:SH n:5/183f2 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:UN n:2/183fa f:lIqob t:SH d:EX/0 a:0 v:0 r:5 m:50
  H: s:SH f:AW e:0 p:31574 [vm.sh] gfs2_getattr+0xb3/0xf0 [gfs2]
  H: s:SH f:AW e:0 p:3054 [ls] gfs2_getattr+0xb3/0xf0 [gfs2]
  H: s:SH f:AW e:0 p:3323 [ls] gfs2_getattr+0xb3/0xf0 [gfs2]
G:  s:SH n:5/181ec f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:UN n:2/183f4 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/805b f:Iqob t:SH d:EX/0 a:0 v:0 r:3 m:50
  H: s:SH f:eEcH e:0 p:8153 [(ended)] init_inodes+0x3ac/0x5f0 [gfs2]
  I: n:5/32859 t:8 f:0x01 d:0x00000200 s:134217728
G:  s:SH n:5/183f3 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:SH n:5/19 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:UN n:3/4000c f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/180e7 f:IqLb t:SH d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/20017 f:DIqob t:SH d:UN/138019392000 a:0 v:0 r:3 m:50
  H: s:SH f:EH e:0 p:12159 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 
[gfs2]
G:  s:SH n:1/2 f:Iqb t:SH d:EX/0 a:0 v:0 r:2 m:50
G:  s:UN n:3/50009 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/18 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:SH n:5/180e7 f:IqLb t:SH d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/181ed f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:UN n:3/11 f:qo t:UN d:EX/0 a:0 v:0 r:1 m:44
G:  s:UN n:3/20012 f:lIqob t:SH d:EX/0 a:0 v:0 r:3 m:50
  H: s:SH f:W e:0 p:3117 [kworker/1:0] gfs2_check_blk_type+0x41/0x150 
[gfs2]
G:  s:SH n:4/0 f:IqLb t:SH d:EX/0 a:0 v:0 r:1 m:50
G:  s:UN n:3/70003 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/20017 f:Iqob t:EX d:EX/0 a:0 v:0 r:3 m:50
  H: s:EX f:H e:0 p:3117 [kworker/1:0] gfs2_evict_inode+0x102/0x360 
[gfs2]
  I: n:8/131095 t:8 f:0x01 d:0x00000000 s:2094
G:  s:SH n:5/16 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:UN n:3/10015 f:IqLo t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/805b f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:SH n:1/1 f:Iqb t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:eEH e:0 p:8153 [(ended)] gfs2_glock_nq_num+0x5a/0xa0 [gfs2]
G:  s:EX n:2/181ec f:yIqob t:EX d:EX/0 a:1 v:0 r:3 m:50
  H: s:EX f:H e:0 p:8153 [(ended)] fill_super+0x94f/0xc80 [gfs2]
  I: n:12/98796 t:8 f:0x00 d:0x00000201 s:24
G:  s:UN n:1/3 f: t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/183f4 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:SH n:2/18 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:3/24 t:4 f:0x00 d:0x00000201 s:3864
G:  s:UN n:3/60006 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:UN n:2/100a5 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/16 f:IqLob t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:1/22 t:4 f:0x00 d:0x00000001 s:3864
G:  s:UN n:2/3001b f:lIqob t:SH d:EX/0 a:0 v:0 r:3 m:50
  H: s:SH f:AW e:0 p:31854 [vm.sh] gfs2_getattr+0xb3/0xf0 [gfs2]
G:  s:SH n:5/17 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:EX n:2/183f8 f:IqLb t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:UN n:2/183f2 f:IqLo t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/17 f:IqLob t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:2/23 t:4 f:0x00 d:0x00000201 s:3864
G:  s:SH n:5/100a5 f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:8153 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:UN n:3/3000f f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/3001b f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:2278 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 [gfs2]
G:  s:SH n:5/183fa f:Iqob t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:13012 [(ended)] gfs2_inode_lookup+0x11e/0x2e0 
[gfs2]
G:  s:EX n:2/181ed f:Iqob t:EX d:EX/0 a:0 v:0 r:3 m:50
  H: s:EX f:H e:0 p:8153 [(ended)] fill_super+0x991/0xc80 [gfs2]
  I: n:13/98797 t:8 f:0x00 d:0x00000200 s:1048576

root at vm03-test:~# cat /sys/kernel/debug/gfs2/kvm\:qemu/glocks
G:  s:EX n:2/30018 f:IqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/183f3 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:EX n:2/30014 f:IqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/100a5 f:Iqo t:SH d:EX/0 a:0 v:0 r:3 m:50
  H: s:SH f:eEcH e:0 p:5787 [(ended)] init_journal+0x184/0x540 [gfs2]
  I: n:6/65701 t:8 f:0x01 d:0x00000200 s:134217728
G:  s:SH n:5/183fa f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:14727 [libvirtd] gfs2_inode_lookup+0x11e/0x2f0 
[gfs2]
G:  s:EX n:3/10015 f:yIqo t:EX d:EX/0 a:2 v:0 r:2 m:10
  R: n:65557 f:30000000 b:31772/31772 i:16
G:  s:UN n:3/50009 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/183f2 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:UN n:2/183f2 f:IqLo t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/183f5 f:IqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/182f0 f:Iqo t:EX d:EX/0 a:0 v:0 r:3 m:50
  H: s:EX f:H e:0 p:5787 [(ended)] init_per_node+0x1a8/0x270 [gfs2]
  I: n:16/99056 t:8 f:0x00 d:0x00000200 s:1048576
G:  s:EX n:2/20018 f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:9/2 f:Iq t:EX d:EX/0 a:0 v:0 r:2 m:50
  H: s:EX f:eH e:0 p:5787 [(ended)] gfs2_glock_nq_num+0x59/0xa0 [gfs2]
G:  s:EX n:2/2001a f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:1/2 f:Iq t:SH d:EX/0 a:0 v:0 r:2 m:50
G:  s:SH n:5/19 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:EX n:2/30019 f:IqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/182ef f:yIqo t:EX d:EX/0 a:1 v:0 r:3 m:50
  H: s:EX f:H e:0 p:5787 [(ended)] init_per_node+0x175/0x270 [gfs2]
  I: n:15/99055 t:8 f:0x00 d:0x00000201 s:24
G:  s:SH n:5/182f0 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:EX n:2/20019 f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/3001b f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:4569 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:SH n:5/16 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:SH n:5/100a5 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:SH n:5/180e7 f:IqL t:SH d:EX/0 a:0 v:0 r:1 m:50
G:  s:UN n:2/183f4 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:UN n:3/3000f f:Io t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/180e7 f:IqL t:SH d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:3/20012 f:Io t:SH d:EX/0 a:0 v:0 r:2 m:10
  R: n:131090 f:30000000 b:65527/65527 i:1
G:  s:UN n:2/19 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/183fa f:IqLo t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:8/99322 t:8 f:0x00 d:0x00000000 s:2390
G:  s:SH n:5/17 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:UN n:3/70003 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/183f9 f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/18 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:EX n:2/30017 f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/2001b f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/30015 f:IqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/18 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:3/24 t:4 f:0x00 d:0x00000201 s:3864
G:  s:UN n:1/3 f: t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:UN n:2/805b f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:EX n:2/183f7 f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/16 f:IqLo t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:1/22 t:4 f:0x00 d:0x00000001 s:3864
G:  s:UN n:3/4000c f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/805b f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:EX n:2/183f6 f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:4/0 f:IqL t:SH d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:5/182ef f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:EX n:2/30016 f:yIqL t:EX d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/17 f:IqLo t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:2/23 t:4 f:0x00 d:0x00000201 s:3864
G:  s:SH n:1/1 f:Iq t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:eEH e:0 p:5787 [(ended)] gfs2_glock_nq_num+0x59/0xa0 [gfs2]
G:  s:SH n:2/3001b f:IqLo t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:8/196635 t:8 f:0x00 d:0x00000000 s:2566
G:  s:SH n:5/183f4 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  H: s:SH f:EH e:0 p:5787 [(ended)] gfs2_inode_lookup+0x11e/0x2f0 [gfs2]
G:  s:EX n:3/11 f:IqLo t:EX d:EX/0 a:0 v:0 r:2 m:50
  R: n:17 f:20000000 b:0/0 i:5
G:  s:UN n:3/60006 f:o t:UN d:EX/0 a:0 v:0 r:1 m:50
G:  s:SH n:2/183f3 f:Iqo t:SH d:EX/0 a:0 v:0 r:2 m:50
  I: n:19/99315 t:8 f:0x00 d:0x00000201 s:768

Kind regards,

Bart

Bart Verwilst schreef op 20.08.2012 14:24:
> At the same time, i notice a hanging /etc/libvirt/qemu gfs2 mount
> (while /var/lib/libvirt/sanlock still works fine) on vm02. vm01 and
> vm03 have perfectly accessible mounts. Nothing special to see in
> syslog or dmesg.
>
> /dev/mapper/iscsi_cluster_qemu on /etc/libvirt/qemu type gfs2
> (rw,relatime,hostdata=jid=2)
> /dev/mapper/iscsi_cluster_sanlock on /var/lib/libvirt/sanlock type
> gfs2 (rw,relatime,hostdata=jid=2)
>
> Any ideas?
>
> Bart
>
> Bart Verwilst schreef op 20.08.2012 14:11:
>> Hello again ;)
>>
>> My cluster seems to be logging only to /var/log/syslog, and even 
>> then
>> only from the corosync daemon; the /var/log/cluster logs are empty:
>>
>> root at vm01-test:~# ls -al /var/log/cluster/*.log
>> -rw------- 1 root root 0 Aug 16 06:50 /var/log/cluster/corosync.log
>> -rw------- 1 root root 0 Aug 20 06:39 
>> /var/log/cluster/dlm_controld.log
>> -rw------- 1 root root 0 Aug 20 06:39 /var/log/cluster/fenced.log
>> -rw------- 1 root root 0 Aug  7 06:27 /var/log/cluster/fence_na.log
>> -rw------- 1 root root 0 Aug 16 06:50 
>> /var/log/cluster/gfs_controld.log
>> -rw------- 1 root root 0 Aug 20 06:39 /var/log/cluster/qdiskd.log
>> -rw------- 1 root root 0 Aug 20 06:39 /var/log/cluster/rgmanager.log
>>
>> Also, I've shut down my 2 VMs with virt-manager or with halt from 
>> the
>> CLI on the guest itself.
>> virsh list on all 3 nodes shows no running guests. However:
>>
>> root at vm01-test:~# clustat
>> Cluster Status for kvm @ Mon Aug 20 14:10:20 2012
>> Member Status: Quorate
>>
>>  Member Name                                                     ID  
>> Status
>>  ------ ----                                                     
>> ---- ------
>>  vm01-test
>> 1 Online, Local, rgmanager
>>  vm02-test
>> 2 Online, rgmanager
>>  vm03-test
>> 3 Online, rgmanager
>>  /dev/mapper/iscsi_cluster_quorum
>> 0 Online, Quorum Disk
>>
>>  Service Name
>> Owner (Last)                                                     
>> State
>>  ------- ----
>> ----- ------                                                     
>> -----
>>  vm:intux_firewall
>> vm02-test
>> started
>>  vm:intux_zabbix
>> vm02-test
>> started
>>
>>
>> My config:
>>
>> <cluster name="kvm" config_version="14">
>> <logging debug="on"/>
>>         <clusternodes>
>>         <clusternode name="vm01-test" nodeid="1">
>> 	<fence>
>> 		<method name="apc">
>> 			<device name="apc01" port="1" action="off"/>
>> 			<device name="apc02" port="1" action="off"/>
>> 			<device name="apc01" port="1" action="on"/>
>> 			<device name="apc02" port="1" action="on"/>
>> 		</method>
>> 	</fence>
>>         </clusternode>
>>         <clusternode name="vm02-test" nodeid="2">
>> 	<fence>
>> 		<method name="apc">
>> 			<device name="apc01" port="8" action="off"/>
>> 			<device name="apc02" port="8" action="off"/>
>> 			<device name="apc01" port="8" action="on"/>
>> 			<device name="apc02" port="8" action="on"/>
>> 		</method>
>>                 </fence>
>>         </clusternode>
>>         <clusternode name="vm03-test" nodeid="3">
>> 	<fence>
>> 		<method name="apc">
>> 			<device name="apc01" port="2" action="off"/>
>> 			<device name="apc02" port="2" action="off"/>
>> 			<device name="apc01" port="2" action="on"/>
>> 			<device name="apc02" port="2" action="on"/>
>> 		</method>
>>                 </fence>
>>         </clusternode>
>>         </clusternodes>
>> <fencedevices>
>> 	<fencedevice agent="fence_apc" ipaddr="apc01" secure="on"
>> login="device" name="apc01" passwd="xxx"/>
>> 	<fencedevice agent="fence_apc" ipaddr="apc02" secure="on"
>> login="device" name="apc02" passwd="xxx"/>
>> </fencedevices>
>> <rm log_level="5">
>> 	<failoverdomains>
>> 		<failoverdomain name="any_node" nofailback="1" ordered="0"
>> restricted="0"/>
>> 	</failoverdomains>
>> 	<vm domain="any_node" max_restarts="2" migrate="live"
>> name="firewall" path="/etc/libvirt/qemu/" recovery="restart"
>> restart_expire_time="600"/>
>> 	<vm domain="any_node" max_restarts="2" migrate="live" name="zabbix"
>> path="/etc/libvirt/qemu/" recovery="restart"
>> restart_expire_time="600"/>
>> </rm>
>> <totem rrp_mode="none" secauth="off"/>
>> <quorumd interval="2" tko="4"
>> device="/dev/mapper/iscsi_cluster_quorum"></quorumd>
>> </cluster>
>>
>> I hope you guys can shed some light on this.. CMAN, rgmanager, ... =
>> 3.1.7-0ubuntu2.1, corosync = 1.4.2-2
>>
>> Kind regards,
>>
>> Bart




More information about the Linux-cluster mailing list