[Cluster-devel] cluster/cman Makefile init.d/Makefile man/Make ...

lhh at sourceware.org lhh at sourceware.org
Fri Jul 21 18:01:40 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	lhh at sourceware.org	2006-07-21 18:01:38

Modified files:
	cman           : Makefile 
	cman/init.d    : Makefile 
	cman/man       : Makefile 
Added files:
	cman/init.d    : qdiskd 
	cman/man       : mkqdisk.8 qdisk.5 qdiskd.8 
	cman/qdisk     : Makefile README bitmap.c clulog.c clulog.h 
	                 crc32.c disk.c disk.h disk_util.c gettid.c 
	                 gettid.h main.c mkqdisk.c platform.h proc.c 
	                 score.c score.h 

Log message:
	Merge from RHEL4 branch; add QDisk

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.4.8.1&r2=1.4.8.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/qdiskd.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1&r2=1.1.8.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/mkqdisk.8.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdiskd.8.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1&r2=1.1.8.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.5.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/README.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/bitmap.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/crc32.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk_util.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/gettid.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/gettid.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/mkqdisk.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/platform.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/proc.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1

--- cluster/cman/Makefile	2005/07/05 16:01:29	1.4.8.1
+++ cluster/cman/Makefile	2006/07/21 18:01:37	1.4.8.2
@@ -14,14 +14,17 @@
 all:
 	cd cman_tool && ${MAKE} all
 	cd lib && ${MAKE} all
+	cd qdisk && ${MAKE} all
 
 copytobin:
 	cd cman_tool && ${MAKE} copytobin
+	cd qdisk && ${MAKE} copytobin
 	cd lib && ${MAKE} copytobin
 
 clean:
 	cd bin && ${MAKE} clean
 	cd cman_tool && ${MAKE} clean
+	cd qdisk && ${MAKE} clean
 	cd lib && ${MAKE} clean
 
 distclean: clean
@@ -31,10 +34,12 @@
 	cd man && ${MAKE} install
 	cd cman_tool && ${MAKE} install
 	cd lib && ${MAKE} install
+	cd qdisk && ${MAKE} install
 	cd init.d && ${MAKE} install
 
 uninstall:
 	cd cman_tool && ${MAKE} uninstall
 	cd lib && ${MAKE} uninstall
 	cd man && ${MAKE} uninstall
+	cd qdisk && ${MAKE} uninstall
 	cd init.d && ${MAKE} uninstall
/cvs/cluster/cluster/cman/init.d/qdiskd,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/init.d/qdiskd
+++ -	2006-07-21 18:01:38.720108000 +0000
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# chkconfig: 345 22 78
+# description: Starts and stops the quorum disk daemon
+#
+#	       
+### BEGIN INIT INFO
+# Provides: 
+### END INIT INFO
+
+. /etc/init.d/functions
+[ -f /etc/sysconfig/cluster ] && . /etc/sysconfig/cluster
+
+LOCK_FILE="/var/lock/subsys/qdiskd"
+
+rtrn=1		# exit status of this script; stays 1 for an unrecognized argument
+retries=0	# number of extra killproc attempts made by "stop"
+
+# See how we were called.  Recognized actions store their result in rtrn.
+case "$1" in
+  start)
+	action "Starting the Quorum Disk Daemon:" qdiskd
+	rtrn=$?
+	[ $rtrn = 0 ] && touch $LOCK_FILE	# lock file is created only on success
+	;;
+
+  stop)
+	echo -n "Stopping the Quorum Disk Daemon:"
+	killproc qdiskd		# first attempt; retried below while a qdiskd pid remains
+	while [ -n "`pidof qdiskd`" ] && [ $retries -lt 5 ]; do
+		sleep 1
+		killproc qdiskd
+		((retries++))
+	done
+	if [ -z "`pidof qdiskd`" ]; then	# success means no qdiskd process is left
+		echo_success
+		echo
+		rtrn=0
+		rm -f $LOCK_FILE
+	else
+		echo_failure
+		echo
+		rtrn=1
+	fi
+	;;
+
+  restart)
+	$0 stop || exit $?	# do not attempt a start if the stop failed
+	$0 start 
+	rtrn=$?
+	;;
+
+  status)
+	status qdiskd
+	rtrn=$?
+	;;
+
+  *)
+	echo $"Usage: $0 {start|stop|restart|status}"
+	;;
+esac
+
+exit $rtrn
--- cluster/cman/init.d/Makefile	2004/12/17 20:07:59	1.1
+++ cluster/cman/init.d/Makefile	2006/07/21 18:01:38	1.1.8.1
@@ -10,7 +10,7 @@
 ###############################################################################
 ###############################################################################
 
-TARGET= cman
+TARGET= cman qdiskd
 
 UNINSTALL=${top_srcdir}/scripts/uninstall.pl
 
/cvs/cluster/cluster/cman/man/mkqdisk.8,v  -->  standard output
revision 1.2.4.1
--- cluster/cman/man/mkqdisk.8
+++ -	2006-07-21 18:01:38.882836000 +0000
@@ -0,0 +1,23 @@
+.TH "mkqdisk" "8" "July 2006" "" "Quorum Disk Management"
+.SH "NAME"
+mkqdisk \- Cluster Quorum Disk Utility
+.SH "WARNING"
+Use of this command can cause the cluster to malfunction.
+.SH "SYNOPSIS"
+\fBmkqdisk [\-?|\-h] | [\-L] | [\-f \fPlabel\fB] [\-c \fPdevice \fB\-l \fPlabel\fB]\fP
+.SH "DESCRIPTION"
+.PP 
+The \fBmkqdisk\fP command is used to create a new quorum disk or display
+existing quorum disks accessible from a given cluster node.
+.SH "OPTIONS"
+.IP "\-c device \-l label"
+Initialize a new cluster quorum disk.  This will destroy all data on the given
+device.  If a cluster is currently using that device as a quorum disk, the
+entire cluster will malfunction.  Do not run this on a device that is in use.
+.IP "\-f label"
+Find the cluster quorum disk with the given label and display information about it.
+.IP "\-L"
+Display information on all accessible cluster quorum disks.
+
+.SH "SEE ALSO"
+qdisk(5) qdiskd(8)
/cvs/cluster/cluster/cman/man/qdisk.5,v  -->  standard output
revision 1.2.4.1
--- cluster/cman/man/qdisk.5
+++ -	2006-07-21 18:01:38.970862000 +0000
@@ -0,0 +1,309 @@
+.TH "QDisk" "5" "July 2006" "" "Cluster Quorum Disk"
+.SH "NAME"
+QDisk 1.0 \- a disk-based quorum daemon for CMAN / Linux-Cluster
+.SH "1. Overview"
+.SH "1.1 Problem"
+In some situations, it may be necessary or desirable to sustain
+a majority node failure of a cluster without introducing the need for
+asymmetric cluster configurations (e.g. client-server, or heavily-weighted
+voting nodes).
+
+.SH "1.2. Design Requirements"
+* Ability to sustain 1..(n-1)/n simultaneous node failures, without the
+danger of a simple network partition causing a split brain.  That is, we
+need to be able to ensure that the majority failure case is not merely
+the result of a network partition.
+
+* Ability to use external reasons for deciding which partition is
+the quorate partition in a partitioned cluster.  For example, a user may
+have a service running on one node, and that node must always be the master
+in the event of a network partition.  Or, a node might lose all network
+connectivity except the cluster communication path - in which case, a
+user may wish that node to be evicted from the cluster.
+
+* Integration with CMAN.  We must not require CMAN to run with us (or
+without us).  Linux-Cluster does not require a quorum disk normally -
+introducing new requirements on the base of how Linux-Cluster operates
+is not allowed.
+
+* Data integrity.  In order to recover from a majority failure, fencing
+is required.  The fencing subsystem is already provided by Linux-Cluster.
+
+* Non-reliance on hardware or protocol specific methods (i.e. SCSI
+reservations).  This ensures the quorum disk algorithm can be used on the
+widest range of hardware configurations possible.
+
+* Little or no memory allocation after initialization.  In critical paths
+during failover, we do not want to have to worry about being killed during
+a memory pressure situation because we request a page fault, and the Linux
+OOM killer responds...
+
+.SH "1.3. Hardware Considerations and Requirements"
+.SH "1.3.1. Concurrent, Synchronous, Read/Write Access"
+This quorum daemon requires a shared block device with concurrent read/write
+access from all nodes in the cluster.  The shared block device can be
+a multi-port SCSI RAID array, a Fiber-Channel RAID SAN, a RAIDed iSCSI
+target, or even GNBD.  The quorum daemon uses O_DIRECT to write to the
+device.
+
+.SH "1.3.2. Bargain-basement JBODs need not apply"
+There is a minimum performance requirement inherent when using disk-based
+cluster quorum algorithms, so design your cluster accordingly.  Using a
+cheap JBOD with old SCSI2 disks on a multi-initiator bus will cause 
+problems at the first load spike.  Plan your loads accordingly; a node's
+inability to write to the quorum disk in a timely manner will cause the
+cluster to evict the node.  Using host-RAID or multi-initiator parallel
+SCSI configurations with the qdisk daemon is unlikely to work, and will
+probably cause administrators a lot of frustration.  That having been
+said, because the timeouts are configurable, most hardware should work
+if the timeouts are set high enough.
+
+.SH "1.3.3. Fencing is Required"
+In order to maintain data integrity under all failure scenarios, use of
+this quorum daemon requires adequate fencing, preferably power-based
+fencing.  Watchdog timers and software-based solutions to reboot the node
+internally, while possibly sufficient, are not considered 'fencing' for 
+the purposes of using the quorum disk.
+
+.SH "1.4. Limitations"
+* At this time, this daemon supports a maximum of 16 nodes.  This is
+primarily a scalability issue: As we increase the node count, we increase
+the amount of synchronous I/O contention on the shared quorum disk.
+
+* Cluster node IDs must be statically configured in cluster.conf and
+must be numbered from 1..16 (there can be gaps, of course).
+
+* Cluster node votes should be more or less equal.
+
+* CMAN must be running before the qdisk program can start.
+
+* CMAN's eviction timeout should be at least 2x the quorum daemon's
+to give the quorum daemon adequate time to converge on a master during a
+failure + load spike situation.
+
+* The total number of votes assigned to the quorum device should be
+equal to or greater than the total number of node-votes in the cluster.
+While it is possible to assign only one (or a few) votes to the quorum
+device, the effects of doing so have not been explored.
+
+* Currently, the quorum disk daemon is difficult to use with CLVM if
+the quorum disk resides on a CLVM logical volume.  CLVM requires a
+quorate cluster to correctly operate, which introduces a chicken-and-egg
+problem for starting the cluster: CLVM needs quorum, but the quorum daemon
+needs CLVM (if and only if the quorum device lies on CLVM-managed storage).
+One way to work around this is to *not* set the cluster's expected votes
+to include the quorum daemon's votes.  Bring all nodes online, and start
+the quorum daemon *after* the whole cluster is running.  This will allow
+the expected votes to increase naturally.
+
+.SH "2. Algorithms"
+.SH "2.1. Heartbeating & Liveliness Determination"
+Nodes update individual status blocks on the quorum disk at a user-
+defined rate.  Each write of a status block alters the timestamp, which
+is what other nodes use to decide whether a node has hung or not.  After
+a user-defined number of 'misses' (that is, failures to update its
+timestamp), a node is declared offline.  After a certain number of 'hits'
+(changed timestamp + "i am alive" state), the node is declared online.
+
+The status block contains additional information, such as a bitmask of
+the nodes that node believes are online.  Some of this information is
+used by the master - while some is just for performance recording, and
+may be used at a later time.  The most important pieces of information
+a node writes to its status block are:
+
+.in 12
+- Timestamp
+.br
+- Internal state (available / not available)
+.br
+- Score
+.br
+- Known max score (may be used in the future to detect invalid configurations)
+.br
+- Vote/bid messages
+.br
+- Other nodes it thinks are online
+.in 0
+
+.SH "2.2. Scoring & Heuristics"
+The administrator can configure up to 10 purely arbitrary heuristics, and
+must exercise caution in doing so.  At least one administrator-
+defined heuristic is required for operation, but it is generally a good
+idea to have more than one heuristic.  By default, only nodes scoring over
+1/2 of the total maximum score will claim they are available via the
+quorum disk, and a node (master or otherwise) whose score drops too low
+will remove itself (usually, by rebooting).
+
+The heuristics themselves can be any command executable by 'sh -c'.  For
+example, in early testing the following was used:
+
+.ti 12
+<\fBheuristic \fP\fIprogram\fP\fB="\fP[ -f /quorum ]\fB" \fP\fIscore\fP\fB="\fP10\fB" \fP\fIinterval\fP\fB="\fP2\fB"/>\fP
+
+This is a literal sh-ism which tests for the existence of a file called
+"/quorum".  Without that file, the node would claim it was unavailable.
+This is an awful example, and should never, ever be used in production,
+but is provided as an example as to what one could do...
+
+Typically, the heuristics should be snippets of shell code or commands which
+help determine a node's usefulness to the cluster or clients.  Ideally, you
+want to add traces for all of your network paths (e.g. check links, or
+ping routers), and methods to detect availability of shared storage.
+
+.SH "2.3. Master Election"
+Only one master is present at any one time in the cluster, regardless of
+how many partitions exist within the cluster itself.  The master is
+elected by a simple voting scheme in which the lowest node which believes
+it is capable of running (i.e. scores high enough) bids for master status.
+If the other nodes agree, it becomes the master.  This algorithm is 
+run whenever no master is present.
+
+If another node comes online with a lower node ID while a node is still
+bidding for master status, it will rescind its bid and vote for the lower
+node ID.  If a master dies or a bidding node dies, the voting algorithm
+is started over.  The voting algorithm typically takes two passes to
+complete.
+
+Master deaths take marginally longer to recover from than non-master
+deaths, because a new master must be elected before the old master can
+be evicted & fenced.
+
+.SH "2.4. Master Duties"
+The master node decides who is or is not in the master partition, as
+well as handles eviction of dead nodes (both via the quorum disk and via
+the linux-cluster fencing system by using the cman_kill_node() API).
+
+.SH "2.5. How it All Ties Together"
+When a master is present, and if the master believes a node to be online,
+that node will advertise to CMAN that the quorum disk is available.  The
+master will only grant a node membership if:
+
+.in 12
+(a) CMAN believes the node to be online, and
+.br
+(b) that node has made enough consecutive, timely writes
+.in 16
+to the quorum disk, and
+.in 12
+(c) the node has a high enough score to consider itself online.
+.in 0
+
+.SH "3. Configuration"
+.SH "3.1. The <quorumd> tag"
+This tag is a child of the top-level <cluster> tag.
+
+.in 8
+\fB<quorumd\fP
+.in 9
+\fIinterval\fP\fB="\fP1\fB"\fP
+.in 12 
+This is the frequency of read/write cycles
+
+.in 9
+\fItko\fP\fB="\fP10\fB"\fP
+.in 12
+This is the number of cycles a node must miss in order to be declared dead.
+
+.in 9
+\fIvotes\fP\fB="\fP3\fB"\fP
+.in 12
+This is the number of votes the quorum daemon advertises to CMAN when it
+has a high enough score.
+
+.in 9
+\fIlog_level\fP\fB="\fP4\fB"\fP
+.in 12
+This controls the verbosity of the quorum daemon in the system logs.
+0 = emergencies; 7 = debug.
+
+.in 9
+\fIlog_facility\fP\fB="\fPlocal4\fB"\fP
+.in 12
+This controls the syslog facility used by the quorum daemon when logging.
+For a complete list of available facilities, see \fBsyslog.conf(5)\fP.
+
+.in 9
+\fIstatus_file\fP\fB="\fP/foo\fB"\fP
+.in 12
+Write internal states out to this file periodically ("-" = use stdout).
+This is primarily used for debugging.
+
+.in 9
+\fImin_score\fP\fB="\fP3\fB"\fP
+.in 12
+Absolute minimum score needed to consider one's self "alive".  If omitted,
+or set to 0, the default function "floor((n+1)/2)" is used, where \fIn\fP
+is the sum total of all defined heuristics' \fIscore\fP attributes.
+
+.in 9
+\fIdevice\fP\fB="\fP/dev/sda1\fB"\fP
+.in 12
+This is the device the quorum daemon will use.  This device must be the
+same on all nodes.
+
+.in 9
+\fIlabel\fP\fB="\fPmylabel\fB"/>\fP
+.in 12
+This overrides the device field if present.  If specified, the quorum
+daemon will read /proc/partitions and check for qdisk signatures
+on every block device found, comparing the label against the specified
+label.  This is useful in configurations where the block device name
+differs on a per-node basis.
+.in 0
+
+.SH "3.2.  The <heuristic> tag"
+This tag is a child of the <quorumd> tag.
+
+.in 8
+\fB<heuristic\fP
+.in 9
+\fIprogram\fP\fB="\fP/test.sh\fB"\fP
+.in 12
+This is the program used to determine if this heuristic is alive.  This
+can be anything which may be executed by \fI/bin/sh -c\fP.  A return
+value of zero indicates success; anything else indicates failure.
+
+.in 9
+\fIscore\fP\fB="\fP1\fB"\fP
+.in 12
+This is the weight of this heuristic.  Be careful when determining scores
+for heuristics.
+
+.in 9
+\fIinterval\fP\fB="\fP2\fB"/>\fP
+.in 12
+This is the frequency at which we poll the heuristic.
+.in 0
+
+.SH "3.3. Example"
+.in 8
+<quorumd interval="1" tko="10" votes="3" label="testing">
+.in 12
+<heuristic program="ping A -c1 -t1" score="1" interval="2"/>
+.br
+<heuristic program="ping B -c1 -t1" score="1" interval="2"/>
+.br
+<heuristic program="ping C -c1 -t1" score="1" interval="2"/>
+.br
+.in 8
+</quorumd>
+.in 0
+
+.SH "3.4. Heuristic score considerations"
+* Heuristic timeouts should be set high enough to allow the previous run
+of a given heuristic to complete.
+
+* Heuristic scripts returning anything except 0 as their return code 
+are considered failed.
+
+* The worst-case for improperly configured quorum heuristics is a race
+to fence where two partitions simultaneously try to kill each other.
+
+.SH "3.5. Creating a quorum disk partition"
+The mkqdisk utility can create and list currently configured quorum disks
+visible to the local node; see
+.B mkqdisk(8)
+for more details.
+
+.SH "SEE ALSO"
+mkqdisk(8), qdiskd(8), cman(5), syslog.conf(5)
/cvs/cluster/cluster/cman/man/qdiskd.8,v  -->  standard output
revision 1.2.4.1
--- cluster/cman/man/qdiskd.8
+++ -	2006-07-21 18:01:39.053646000 +0000
@@ -0,0 +1,20 @@
+.TH "qdiskd" "8" "July 2006" "" "Quorum Disk Management"
+.SH "NAME"
+qdiskd \- Cluster Quorum Disk Daemon
+.SH "SYNOPSIS"
+\fBqdiskd [\-f] [\-d]
+.SH "DESCRIPTION"
+.PP 
+The \fBqdiskd\fP daemon talks to CMAN and provides a mechanism for determining
+node-fitness in a cluster environment.  See
+.B
+qdisk(5)
+for configuration information.
+.SH "OPTIONS"
+.IP "\-f"
+Run in the foreground (do not fork / daemonize).
+.IP "\-d"
+Enable debug output.
+
+.SH "SEE ALSO"
+mkqdisk(8), qdisk(5), cman(5)
--- cluster/cman/man/Makefile	2004/08/13 06:38:22	1.1
+++ cluster/cman/man/Makefile	2006/07/21 18:01:38	1.1.8.1
@@ -18,10 +18,10 @@
 install:
 	install -d ${mandir}/man5
 	install -d ${mandir}/man8
-	install cman.5 ${mandir}/man5
-	install cman_tool.8 ${mandir}/man8
+	install cman.5 qdisk.5 ${mandir}/man5
+	install cman_tool.8 qdiskd.8 mkqdisk.8 ${mandir}/man8
 
 uninstall:
-	${UNINSTALL} cman.5 ${mandir}/man5
-	${UNINSTALL} cman_tool.8 ${mandir}/man8
+	${UNINSTALL} cman.5 qdisk.5 ${mandir}/man5
+	${UNINSTALL} cman_tool.8 qdiskd.8 mkqdisk.8 ${mandir}/man8
 
/cvs/cluster/cluster/cman/qdisk/Makefile,v  -->  standard output
revision 1.5.2.1
--- cluster/cman/qdisk/Makefile
+++ -	2006-07-21 18:01:39.244798000 +0000
@@ -0,0 +1,49 @@
+###############################################################################
+###############################################################################
+##
+##  Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+##
+##  This copyrighted material is made available to anyone wishing to use,
+##  modify, copy, or redistribute it subject to the terms and conditions
+##  of the GNU General Public License v.2.
+##
+###############################################################################
+###############################################################################
+
+top_srcdir=..
+UNINSTALL=${top_srcdir}/scripts/uninstall.pl
+
+include ${top_srcdir}/make/defines.mk
+
+INCLUDES+=-I. -I../lib
+CFLAGS +=-I${incdir} -I${top_srcdir}/config \
+         -Wall -Werror -Wstrict-prototypes -Wshadow -D_GNU_SOURCE -g
+
+TARGET=qdiskd mkqdisk
+
+.PHONY: all copytobin install clean uninstall	# commands, not files
+all: ${TARGET}
+
+copytobin: all
+	cp ${TARGET} ${top_srcdir}/bin
+
+install: ${TARGET}
+	install -d ${sbindir}
+	install ${TARGET} ${sbindir}
+
+qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
+	gettid.o proc.o ../lib/libcman.a
+	$(CC) -o $@ $^ -lpthread -L../lib -lccs
+
+mkqdisk: disk.o crc32.o disk_util.o proc.o mkqdisk.o
+	$(CC) -o $@ $^
+
+# Objects are compiled and linked with $(CC); $< keeps extra prerequisites out of the compile.
+%.o: %.c
+	$(CC) -c -o $@ $< $(INCLUDES) $(CFLAGS)
+
+clean:
+	rm -f *.o ${TARGET}
+
+uninstall:
+	${UNINSTALL} ${TARGET} ${sbindir}
/cvs/cluster/cluster/cman/qdisk/README,v  -->  standard output
revision 1.4.2.1
--- cluster/cman/qdisk/README
+++ -	2006-07-21 18:01:39.324979000 +0000
@@ -0,0 +1,274 @@
+qdisk 1.0 - a disk-based quorum algorithm for Linux-Cluster
+
+(C) 2006 Red Hat, Inc.
+
+1. Overview
+
+1.1. Problem
+
+In some situations, it may be necessary or desirable to sustain a majority
+node failure of a cluster without introducing the need for asymmetric cluster
+configurations (e.g. client-server, or heavily-weighted voting nodes).
+
+1.2. Design Requirements
+
+* Ability to sustain 1..(n-1)/n simultaneous node failures, without the
+danger of a simple network partition causing a split brain.  That is, we
+need to be able to ensure that the majority failure case is not merely
+the result of a network partition.
+
+* Ability to use external reasons for deciding which partition is
+the quorate partition in a partitioned cluster.  For example, a user may
+have a service running on one node, and that node must always be the master
+in the event of a network partition.  Or, a node might lose all network
+connectivity except the cluster communication path - in which case, a
+user may wish that node to be evicted from the cluster.
+
+* Integration with CMAN.  We must not require CMAN to run with us (or
+without us).  Linux-Cluster does not require a quorum disk normally -
+introducing new requirements on the base of how Linux-Cluster operates
+is not allowed.
+
+* Data integrity.  In order to recover from a majority failure, fencing
+is required.  The fencing subsystem is already provided by Linux-Cluster.
+
+* Non-reliance on hardware or protocol specific methods (i.e. SCSI
+reservations).  This ensures the quorum disk algorithm can be used on the
+widest range of hardware configurations possible.
+
+* Little or no memory allocation after initialization.  In critical paths
+during failover, we do not want to have to worry about being killed during
+a memory pressure situation because we request a page fault, and the Linux
+OOM killer responds...
+
+
+1.3. Hardware Configuration Considerations
+
+1.3.1. Concurrent, Synchronous, Read/Write Access
+
+This daemon requires a shared block device with concurrent read/write
+access from all nodes in the cluster.  The shared block device can be
+a multi-port SCSI RAID array, a Fiber-Channel RAID SAN, a RAIDed iSCSI
+target, or even GNBD.  The quorum daemon uses O_DIRECT to write to the
+device.
+
+1.3.2. Bargain-basement JBODs need not apply
+
+There is a minimum performance requirement inherent when using disk-based
+cluster quorum algorithms, so design your cluster accordingly.  Using a
+cheap JBOD with old SCSI2 disks on a multi-initiator bus will cause 
+problems at the first load spike.  Plan your loads accordingly; a node's
+inability to write to the quorum disk in a timely manner will cause the
+cluster to evict the node.  Using host-RAID or multi-initiator parallel
+SCSI configurations with the qdisk daemon is unlikely to work, and will
+probably cause administrators a lot of frustration.  That having been
+said, because the timeouts are configurable, most hardware should work
+if the timeouts are set high enough.
+
+1.3.3. Fencing is Required
+
+In order to maintain data integrity under all failure scenarios, use of
+this quorum daemon requires adequate fencing, preferably power-based
+fencing.
+
+
+1.4. Limitations
+
+* At this time, this daemon only supports a maximum of 16 nodes.
+
+* Cluster node IDs must be statically configured in cluster.conf and
+must be numbered from 1..16 (there can be gaps, of course).
+
+* Cluster node votes should be more or less equal.
+
+* CMAN must be running before the qdisk program can start.  This
+limitation will be removed before a production release.
+
+* CMAN's eviction timeout should be at least 2x the quorum daemon's
+to give the quorum daemon adequate time to converge on a master during a
+failure + load spike situation.
+
+* The total number of votes assigned to the quorum device should be
+equal to or greater than the total number of node-votes in the cluster.
+While it is possible to assign only one (or a few) votes to the quorum
+device, the effects of doing so have not been explored.
+
+* Currently, the quorum disk daemon is difficult to use with CLVM if
+the quorum disk resides on a CLVM logical volume.  CLVM requires a
+quorate cluster to correctly operate, which introduces a chicken-and-egg
+problem for starting the cluster: CLVM needs quorum, but the quorum daemon
+needs CLVM (if and only if the quorum device lies on CLVM-managed storage).
+One way to work around this is to *not* set the cluster's expected votes
+to include the quorum daemon's votes.  Bring all nodes online, and start
+the quorum daemon *after* the whole cluster is running.  This will allow
+the expected votes to increase naturally.
+
+
+2. Algorithms
+
+2.1. Heartbeating & Liveliness Determination
+
+Nodes update individual status blocks on the quorum disk at a user-
+defined rate.  Each write of a status block alters the timestamp, which
+is what other nodes use to decide whether a node has hung or not.  After
+a user-defined number of 'misses' (that is, failures to update its
+timestamp), a node is declared offline.  After a certain number of 'hits'
+(changed timestamp + "i am alive" state), the node is declared online.
+
+The status block contains additional information, such as a bitmask of
+the nodes that node believes are online.  Some of this information is
+used by the master - while some is just for performance recording, and
+may be used at a later time.  The most important pieces of information
+a node writes to its status block are:
+
+  - timestamp
+  - internal state (available / not available)
+  - score
+  - max score
+  - vote/bid messages
+  - other nodes it thinks are online
+
+
+2.2. Scoring & Heuristics
+
+The administrator can configure up to 10 purely arbitrary heuristics, and
+must exercise caution in doing so.  By default, only nodes scoring over
+1/2 of the total maximum score will claim they are available via the
+quorum disk, and a node (master or otherwise) whose score drops too low
+will remove itself (usually, by rebooting).
+
+The heuristics themselves can be any command executable by 'sh -c'.  For
+example, in early testing, I used this:
+
+    <heuristic program="[ -f /quorum ]" score="10" interval="2"/>
+
+This is a literal sh-ism which tests for the existence of a file called
+"/quorum".  Without that file, the node would claim it was unavailable.
+This is an awful example, and should never, ever be used in production,
+but is provided as an example as to what one could do...
+
+Typically, the heuristics should be snippets of shell code or commands which
+help determine a node's usefulness to the cluster or clients.  Ideally, you
+want to add traces for all of your network paths (e.g. check links, or
+ping routers), and methods to detect availability of shared storage.
+
+
+2.3. Master Election
+
+Only one master is present at any one time in the cluster, regardless of
+how many partitions exist within the cluster itself.  The master is
+elected by a simple voting scheme in which the lowest node which believes
+it is capable of running (i.e. scores high enough) bids for master status.
+If the other nodes agree, it becomes the master.  This algorithm is 
+run whenever no master is present.
+
+If another node comes online with a lower node ID while a node is still
+bidding for master status, it will rescind its bid and vote for the lower
+node ID.  If a master dies or a bidding node dies, the voting algorithm
+is started over.  The voting algorithm typically takes two passes to
+complete.
+
+Master deaths take marginally longer to recover from than non-master
+deaths, because a new master must be elected before the old master can
+be evicted & fenced.
+
+
+2.4. Master Duties
+
+The master node decides who is or is not in the master partition, as
+well as handles eviction of dead nodes (both via the quorum disk and via
+the linux-cluster fencing system by using the cman_kill_node() API).
+
+
+2.5. How it All Ties Together
+
+When a master is present, and if the master believes a node to be online,
+that node will advertise to CMAN that the quorum disk is available.  The
+master will only grant a node membership if:
+
+   (a) CMAN believes the node to be online, and
+   (b) that node has made enough consecutive, timely writes to the quorum
+       disk.
+
+
+3. Configuration
+
+3.1. The <quorumd> tag
+
+This tag is a child of the top-level <cluster> tag.
+
+   <quorumd
+    interval="1"          This is the frequency of read/write cycles
+    tko="10"              This is the number of cycles a node must miss
+                          in order to be declared dead.
+    votes="3"             This is the number of votes the quorum daemon
+                          advertises to CMAN when it has a high enough
+                          score.
+    log_level="4"         This controls the verbosity of the quorum daemon
+                          in the system logs. 0 = emergencies; 7 = debug
+    log_facility="local4" This controls the syslog facility used by the
+			  quorum daemon when logging.
+    status_file="/foo"    Write internal states out to this file
+			  periodically ("-" = use stdout).
+    min_score="3"	  Absolute minimum score needed to consider one's
+			  self "alive".  If omitted, or set to 0, the
+			  default function "floor((n+1)/2)" is used.
+    device="/dev/sda1"    This is the device the quorum daemon will use.
+			  This device must be the same on all nodes.
+    label="mylabel"/>     This overrides the device field if present.
+			  If specified, the quorum daemon will read
+			  /proc/partitions and check for qdisk signatures
+			  on every block device found, comparing the label
+			  against the specified label.  This is useful in
+			  configurations where the block device name
+			  differs on a per-node basis.
+
+
+3.2.  The <heuristic> tag
+
+This tag is a child of the <quorumd> tag.
+
+   <heuristic
+    program="/test.sh"    This is the program used to determine if this
+                          heuristic is alive.  This can be anything which
+                          may be executed by "/bin/sh -c".  A return value
+                          of zero indicates success.
+    score="1"             This is the weight of this heuristic.  Be careful
+                          when determining scores for heuristics.
+    interval="2"/>        This is the frequency at which we poll the
+                          heuristic.
+
+3.3. Example
+
+  <quorumd interval="1" tko="10" votes="3" device="/dev/gnbd/qdisk">
+    <heuristic program="ping routerA -c1 -t1" score="1" interval="2"/>
+    <heuristic program="ping routerB -c1 -t1" score="1" interval="2"/>
+    <heuristic program="ping routerC -c1 -t1" score="1" interval="2"/>
+  </quorumd>
+
+3.4. Heuristic score considerations
+
+* Heuristic timeouts should be set high enough to allow the previous run
+of a given heuristic to complete.
+
+* Heuristic scripts returning anything except 0 as their return code 
+are considered failed.
+
+* The worst-case for improperly configured quorum heuristics is a race
+to fence where two partitions simultaneously try to kill each other.
+
+3.5. Creating a quorum disk partition
+
+3.5.1. The mkqdisk utility.
+
+The mkqdisk utility can create and list currently configured quorum disks
+visible to the local node.
+
+  mkqdisk -L		List available quorum disks.
+
+  mkqdisk -f <label>	Find a quorum device by the given label.
+
+  mkqdisk -c <device> -l <label>
+			Initialize <device> and name it <label>.  This
+			will destroy all data on the device, so be careful
+			when running this command.
/cvs/cluster/cluster/cman/qdisk/bitmap.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/bitmap.c
+++ -	2006-07-21 18:01:39.411387000 +0000
@@ -0,0 +1,107 @@
+/*
+  Copyright Red Hat, Inc. 2002-2003, 2006
+
+  The Red Hat Cluster Manager API Library is free software; you can
+  redistribute it and/or modify it under the terms of the GNU Lesser
+  General Public License as published by the Free Software Foundation;
+  either version 2.1 of the License, or (at your option) any later
+  version.
+
+  The Red Hat Cluster Manager API Library is distributed in the hope
+  that it will be useful, but WITHOUT ANY WARRANTY; without even the
+  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+  PURPOSE.  See the GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+ */
+/** @file
+ * Bitmap and membership mask handling routines.
+ */
+#include <stdint.h>
+
+
+/**
+ * Reset (zero) a single bit of a byte-array bitmap.
+ *
+ * Bit N lives in byte N / 8 of the mask, at bit position N % 8 counted
+ * from the least significant bit.
+ *
+ * @param mask		Bitmask to modify.
+ * @param bitidx	Index of the bit to clear.
+ * @param masklen	Bitmask length (in uint8_t units).
+ * @return		-1 if bitidx lies beyond the end of the bitmap (the
+ *			mask is left untouched), otherwise 0.
+ */
+int
+clear_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	const uint32_t byte_no = bitidx / 8;
+	const uint8_t  keep = (uint8_t)~(1u << (bitidx % 8));
+
+	/* Refuse to touch memory past the end of the mask. */
+	if (byte_no >= masklen)
+		return -1;
+
+	mask[byte_no] = mask[byte_no] & keep;
+	return 0;
+}
+
+
+/**
+ * Turn on a single bit of a byte-array bitmap.
+ *
+ * Bit N lives in byte N / 8 of the mask, at bit position N % 8 counted
+ * from the least significant bit.
+ *
+ * @param mask		Bitmask to modify.
+ * @param bitidx	Index of the bit to set.
+ * @param masklen	Bitmask length (in uint8_t units).
+ * @return		-1 if bitidx lies beyond the end of the bitmap (the
+ *			mask is left untouched), otherwise 0.
+ */
+int
+set_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	const uint32_t byte_no = bitidx / 8;
+	const uint8_t  flag = (uint8_t)(1u << (bitidx % 8));
+
+	/* Refuse to touch memory past the end of the mask. */
+	if (byte_no >= masklen)
+		return -1;
+
+	mask[byte_no] = mask[byte_no] | flag;
+	return 0;
+}
+
+
+/**
+ * Report whether a single bit of a byte-array bitmap is set.
+ *
+ * Bit N lives in byte N / 8 of the mask, at bit position N % 8 counted
+ * from the least significant bit.
+ *
+ * @param mask		Bitmask to check.
+ * @param bitidx	Index of the bit to check.
+ * @param masklen	Bitmask length (in uint8_t units).
+ * @return		-1 if bitidx lies beyond the end of the bitmap,
+ *			0 if the bit is clear, or 1 if it is set.
+ */
+int
+is_bit_set(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	const uint32_t byte_no = bitidx / 8;
+	const uint32_t shift = bitidx % 8;
+
+	/* Never read past the end of the mask. */
+	if (byte_no >= masklen)
+		return -1;
+
+	return (mask[byte_no] >> shift) & 1;
+}
+
+
/cvs/cluster/cluster/cman/qdisk/clulog.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/clulog.c
+++ -	2006-07-21 18:01:39.508895000 +0000
@@ -0,0 +1,296 @@
+/*
+  Copyright Red Hat, Inc. 2002
+  Copyright Mission Critical Linux, 2000
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/** @file
+ * Library routines for communicating with the logging daemon.
+ *
+ *  $Id: clulog.c,v 1.2.2.1 2006/07/21 18:01:38 lhh Exp $
+ *
+ *  Author: Jeff Moyer <moyer at missioncriticallinux.com>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <malloc.h>
+#include <dirent.h>
+#include <signal.h>
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <ccs.h>
+#define SYSLOG_NAMES
+#include <sys/syslog.h>
+#undef SYSLOG_NAMES
+
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <linux/unistd.h>
+#include <pthread.h>
+#include <gettid.h>
+#include <clulog.h>
+#include <string.h>
+
+
+static const char *version __attribute__ ((unused)) = "$Revision: 1.2.2.1 $";
+
+#ifdef DEBUG
+#include <assert.h>
+#define Dprintf(fmt,args...) printf(fmt,##args)
+#define DBG_ASSERT(x)  assert(x)
+#else
+#define Dprintf(fmt,args...)
+#define DBG_ASSERT(x)
+#endif
+
+/*
+ * Globals
+ *
+ * File-local logging state shared by the functions below.
+ */
+/* Nonzero once do_clulog() has called openlog(); reset to 0 to force a
+ * fresh openlog() on the next message. */
+static int   log_is_open = 0;
+/* Nonzero when every message should also be printed on stdout; changed
+ * through clu_log_console(). */
+static int   useconsole = 0;
+/* Messages with a severity numerically greater than this are dropped. */
+static int   loglevel = LOGLEVEL_DFLT;
+/* Facility handed to openlog(); changed through clu_set_facility(). */
+static int   syslog_facility = LOG_DAEMON;
+/* Heap copy (strdup) of the last program name passed to do_clulog(). */
+static char  *daemon_name = NULL;
+/* Process or thread ID most recently recorded by do_clulog(). */
+static pid_t daemon_pid = -1;
+/* Taken by do_clulog(), clu_get_facility() and clu_set_facility(). */
+static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Severity names; do_clulog() indexes this table directly with the
+ * severity value to build the "<name>" message prefix. */
+CODE logger_prioritynames[] = 
+{ {"emerg", LOG_EMERG},
+  {"alert", LOG_ALERT},
+  {"crit", LOG_CRIT},
+  {"err", LOG_ERR},
+  {"warning", LOG_WARNING},
+  {"notice", LOG_NOTICE},
+  {"info", LOG_INFO},
+  {"debug", LOG_DEBUG}
+};
+
+/*
+ *  Exported Functions.
+ */
+
+/**
+ * Read the current cluster log level.
+ *
+ * The value is read without taking log_mutex.
+ *
+ * @return The current cluster log level.
+ */
+int
+clu_get_loglevel(void)
+{
+	return loglevel;
+}
+
+
+/**
+ * Set the cluster log level.
+ *
+ * Any non-negative value is accepted, including 0 (LOG_EMERG), which limits
+ * logging to emergency messages only.
+ *
+ * @param severity	New log level.
+ * @return 		Old log level, or -1 if 'severity' is an invalid
+ *			(negative) log level; the level is unchanged then.
+ */
+int
+clu_set_loglevel(int severity)
+{
+	int previous;
+
+	/* Only negative values are invalid; 0 is LOG_EMERG. */
+	if (severity < 0)
+		return -1;
+
+	previous = loglevel;
+	loglevel = severity;
+
+	return previous;
+}
+
+
+/**
+ * Look up the name of the current cluster log facility.
+ *
+ * The facilitynames table is searched, under log_mutex, for the entry whose
+ * value equals the current facility.
+ *
+ * @return The matching facility name, or "local4" if no entry matches.
+ */
+char *
+clu_get_facility(void)
+{
+	char *name = "local4";
+	int i;
+
+	pthread_mutex_lock(&log_mutex);
+	for (i = 0; facilitynames[i].c_name != NULL; i++) {
+		if (facilitynames[i].c_val == syslog_facility) {
+			name = facilitynames[i].c_name;
+			break;
+		}
+	}
+	pthread_mutex_unlock(&log_mutex);
+
+	return name;
+}
+
+
+/**
+ * Set the cluster log facility.
+ *
+ * The name is looked up in the facilitynames table; an unknown name leaves
+ * the facility as it was.  When the facility actually changes, the syslog
+ * connection is closed so that the next message reopens it.
+ *
+ * @param facilityname  New log facility (see /usr/include/sys/syslog.h).
+ * @return 		0
+ */
+int
+clu_set_facility(char *facilityname)
+{
+	int i;
+	int previous;
+
+	pthread_mutex_lock(&log_mutex);
+	previous = syslog_facility;
+
+	for (i = 0; facilitynames[i].c_name != NULL; i++) {
+		if (strcmp(facilityname, facilitynames[i].c_name) == 0) {
+			syslog_facility = facilitynames[i].c_val;
+			break;
+		}
+	}
+
+	/* Only drop the syslog connection if something changed. */
+	if (syslog_facility != previous) {
+		closelog();
+		log_is_open = 0;
+	}
+
+	pthread_mutex_unlock(&log_mutex);
+	return 0;
+}
+
+
+/**
+ * Switch console logging on or off.  Does not work for daemons.
+ *
+ * @param onoff		0 = off, any other value = on.
+ * @return		The console logging state (0 or 1) that was in effect
+ *			before this call.
+ */
+int
+clu_log_console(int onoff)
+{
+	const int previous = useconsole;
+
+	/* Normalize any nonzero request to exactly 1. */
+	useconsole = (onoff != 0);
+
+	return previous;
+}
+
+
+/**
+ * Cluster logging function.  Formats a message, sends it to syslog and,
+ * if requested, also writes it to stdout.
+ *
+ * Callers normally go through the clulog(), clulog_pid() and
+ * clulog_and_print() wrapper macros instead of calling this directly.
+ *
+ * @param severity	syslog level (LOG_EMERG .. LOG_DEBUG).  Messages whose
+ *			severity is numerically greater than the current log
+ *			level are dropped.  Values outside LOG_EMERG ..
+ *			LOG_DEBUG are clamped into that range.
+ * @param write_to_cons	Nonzero to also print the message on stdout (this
+ *			happens as well whenever console logging is enabled).
+ * @param pid		0 to log under the calling thread's ID, or an
+ *			explicit process ID to embed in the message text.
+ * @param prog		Program name to log under, or NULL to keep the
+ *			current one.
+ * @param fmt		printf-style format string, followed by its arguments.
+ * @return		0 always.
+ */
+int
+do_clulog(int        severity,
+	  int        write_to_cons,
+	  pid_t      pid,
+	  char       *prog,
+	  const char *fmt, ...)
+{
+	va_list      args;
+	char         logmsg[MAX_LOGMSG_LEN];	/* message to go to the log */
+	char         printmsg[MAX_LOGMSG_LEN];	/* message to go to stdout */
+	int          syslog_flags = LOG_NDELAY;
+
+	pthread_mutex_lock(&log_mutex);
+	if (severity > loglevel) {
+		pthread_mutex_unlock(&log_mutex);
+		return 0;
+	}
+
+	/*
+	 * The severity is used below as an index into logger_prioritynames
+	 * and as the syslog priority, so keep it inside the table.  (The log
+	 * level itself may legitimately be set higher than LOG_DEBUG.)
+	 */
+	if (severity < LOG_EMERG)
+		severity = LOG_EMERG;
+	else if (severity > LOG_DEBUG)
+		severity = LOG_DEBUG;
+
+	memset(logmsg, 0, MAX_LOGMSG_LEN);
+	memset(printmsg, 0, MAX_LOGMSG_LEN);
+
+	/*
+	 * Check to see if the caller has forked.
+	 */
+	if (!pid) {
+
+		/* Use thread IDs */
+		if (daemon_pid != gettid()) {
+
+			daemon_pid = gettid();
+			log_is_open = 0;
+		}
+
+		syslog_flags |= LOG_PID;
+
+	} else {
+
+		daemon_pid = pid;
+		closelog();
+		log_is_open = 0;
+		snprintf(logmsg, MAX_LOGMSG_LEN, "[%d]: ", (int)pid);
+	}
+
+	if (prog) {
+
+		/*
+		 * openlog() may keep the ident pointer it was given rather
+		 * than copying the string, so close the log before the old
+		 * name is released and reopen it under the new one below.
+		 */
+		closelog();
+		log_is_open = 0;
+
+		if (daemon_name) {
+
+			free(daemon_name);
+			daemon_name = NULL;
+		}
+
+		daemon_name = strdup(prog);
+	}
+
+	if (!log_is_open) {
+
+		openlog(daemon_name, syslog_flags, syslog_facility);
+		log_is_open = 1;
+	}
+	/*
+	 * Note: This can be called in the context of a CGI program, in which
+	 * case anything printed to stdout goes to the web page.  This can
+	 * cause problems if we have our standard <warning> strings b/c
+	 * the web client will try to interpret this as an html tag.
+	 */
+	snprintf(logmsg + strlen(logmsg), MAX_LOGMSG_LEN - strlen(logmsg), 
+		 "<%s> ", logger_prioritynames[severity].c_name);
+
+	va_start(args, fmt);
+	vsnprintf(logmsg + strlen(logmsg), MAX_LOGMSG_LEN - strlen(logmsg), 
+		  fmt, args);
+	va_end(args);
+
+	if (write_to_cons || useconsole) {
+		snprintf(printmsg, MAX_LOGMSG_LEN, "[%d] %s: ",
+			 (int)daemon_pid,
+			 logger_prioritynames[severity].c_name);
+
+		va_start(args, fmt);
+		vsnprintf(printmsg + strlen(printmsg),
+			  MAX_LOGMSG_LEN - strlen(printmsg), fmt, args);
+		va_end(args);
+
+		fprintf(stdout, "%s", printmsg);
+	}
+
+	/*
+	 * logmsg already contains the caller's expanded text; pass it as
+	 * data so that any '%' in it is not interpreted a second time.
+	 */
+	syslog(severity, "%s", logmsg);
+
+	pthread_mutex_unlock(&log_mutex);
+
+	return 0;
+}
+
+
+/**
+ * Stop the cluster logging facility.
+ *
+ * Closes the syslog connection and clears the "log is open" flag, so that
+ * the next message logged through do_clulog() calls openlog() again with
+ * the configured program name and facility.
+ */
+void
+clulog_close(void)
+{
+	pthread_mutex_lock(&log_mutex);
+	closelog();
+	log_is_open = 0;
+	pthread_mutex_unlock(&log_mutex);
+}
/cvs/cluster/cluster/cman/qdisk/clulog.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/clulog.h
+++ -	2006-07-21 18:01:39.595061000 +0000
@@ -0,0 +1,161 @@
+/*
+  Copyright Red Hat, Inc. 2002
+  Copyright Mission Critical Linux, 2000
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/** @file
+ * Header for clulog.c
+ */
+/*
+ *  author: Jeff Moyer <moyer at missioncriticallinux.com>
+ */
+
+#ifndef __CLUSTER_LOG_H
+#define __CLUSTER_LOG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <syslog.h>
+#include <sys/types.h>
+
+#define LOGLEVEL_DFLT         LOG_INFO
+#define MAX_LOGMSG_LEN        512
+
+/*
+ * int clu_set_loglevel(int severity)
+ *
+ * DESCRIPTION
+ *   Set the logging level for this daemon.  This is not a 
+ *   system-wide setting.
+ *
+ * ARGUMENTS
+ *   severity  Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ *
+ * RETURN VALUES
+ *   On success, the previous loglevel is returned.  On error -1 is returned.
+ *
+ * NOTES
+ *   The only way of generating errors for this call is to give a negative
+ *   value for severity.  Currently, syslog lists severities up to 8, but
+ *   I see no reason for this restriction if, in the future, we decided to
+ *   add more levels.  Thus, any number up to MAXINT will be supported.
+ */
+int clu_set_loglevel(int severity);
+int clu_set_facility(char *facility);
+int clu_log_console(int onoff);
+
+/*
+ * int clu_get_loglevel(void)
+ *
+ * DESCRIPTION
+ *   Get the current logging level.
+ *
+ * ARGUMENTS
+ *   none
+ *
+ * RETURN VALUES
+ *   The current logging level is returned.
+ */
+int clu_get_loglevel(void);
+
+/*
+ * DESCRIPTION
+ *   Cluster logging facility.  This is the actual function that does the
+ *   logging.  No one should call this, you should call the wrappers provided.
+ *   i.e. clulog and clulog_and_print.
+ */
+int do_clulog(int severity, int write_to_cons, pid_t pid,
+	      char *prog, const char *fmt, ...);
+/*
+ * int clulog(int severity, const char *fmt, ...)
+ *
+ * DESCRIPTION
+ *   Cluster logging facility.  This is a library routine which sends the 
+ *   supplied parameters to the syslog daemon.  If the supplied severity is 
+ *   numerically larger than the current loglevel, the message is never sent 
+ *   to the log.
+ *
+ * ARGUMENTS
+ *   severity  Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ *   fmt       Format string as used with printf.
+ *
+ * RETURN VALUES
+ *   On success, 0 is returned.  On error, -1 is returned.
+ *
+ * NOTES
+ *   Inability to contact the logging daemon is the only source of error
+ *   for this function.  Thus, it would behoove you to try a clulog before
+ *   daemonizing your process.  If it fails, print a message to stderr
+ *   explaining that the cluster logging daemon should probably be started.
+ *   If you really want your message to be heard by someone, use
+ *   clulog_and_print().
+ */
+#define clulog(x,fmt,args...)              do_clulog(x,0,0,NULL,fmt,##args)
+#define clulog_pid(x,pid,prog,fmt,args...) do_clulog(x,0,pid,prog,fmt,##args)
+
+/*
+ * int clulog_and_print(int severity, const char *fmt, ...)
+ *
+ * DESCRIPTION
+ *   Cluster logging facility.  This is a library routine which sends the 
+ *   supplied parameters to the syslog daemon.  If the supplied severity is 
+ *   numerically larger than the current loglevel, the message is never sent 
+ *   to the log.  This version also prints the given message to the terminal.
+ *
+ * ARGUMENTS
+ *   severity       Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ *   fmt            Format string as used with printf.
+ *
+ * RETURN VALUES
+ *   On success, 0 is returned.  On error, -1 is returned.
+ */
+#define clulog_and_print(x,fmt,args...)   do_clulog(x,1,0,NULL,fmt,##args)
+
+
+/*
+ * void clulog_close(void)
+ *
+ * DESCRIPTION
+ *   This is an optional call to close the logfile.  This translates into a
+ *   closelog() call.
+ *
+ * ARGUMENTS
+ *   none
+ *
+ * RETURN VALUES
+ *   This function does not return anything.
+ */
+void clulog_close(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif				/* __CLUSTER_LOG_H */
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ *  c-indent-level: 8
+ *  tab-width: 8
+ * End:
+ */
/cvs/cluster/cluster/cman/qdisk/crc32.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/crc32.c
+++ -	2006-07-21 18:01:39.679326000 +0000
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2000 Bryan Call <bc at fodder.org>
+ *
+ * Modified by Lon H. Hohberger <lhh at redhat.com>
+ * Copyright (C) 2003 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/** @file
+ * Calculates CRC32s on data.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+static const unsigned long crctable[256] = {
+  0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+  0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+  0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+  0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+  0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+  0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+  0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+  0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+  0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+  0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+  0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+  0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+  0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+  0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+  0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+  0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+  0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+  0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+  0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+  0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+  0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+  0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+  0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+  0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+  0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+  0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+  0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+  0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+  0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+  0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+  0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+  0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+  0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+  0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+  0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+  0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+  0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+  0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+  0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+  0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+  0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+  0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+  0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+  0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+  0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+  0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+  0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+  0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+  0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+  0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+  0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+  0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+  0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+  0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+  0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+  0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+  0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+  0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+  0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+  0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+  0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+  0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+  0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+  0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
+};
+
+
+/**
+ * Calculate CRC32 of a data set.
+ *
+ * The checksum is table driven (crctable), starts from an all-ones value
+ * and is inverted at the end.
+ *
+ * @param data		Data set for building CRC32
+ * @param count		Size of data set, in bytes.
+ * @return 		CRC32 of data set.  An empty data set (count == 0)
+ *			yields 0.
+ */
+uint32_t clu_crc32(const char *data, size_t count)
+{
+	const unsigned char *bytes = (const unsigned char *)data;
+	uint32_t crc = (uint32_t)~0;
+	size_t i;	/* same width as count, so it cannot wrap early */
+
+	for (i = 0; i < count; i++)
+		crc = (crc >> 8) ^ (uint32_t)crctable[(crc ^ bytes[i]) & 0xff];
+
+	/* An untouched (all-ones) accumulator inverts to 0. */
+	return ~crc;
+}
+
+#if 0
+int
+main(int argc, const char **argv)
+{
+	printf("%08x\n",crc32(argv[1],strlen(argv[1])));
+}
+#endif
/cvs/cluster/cluster/cman/qdisk/disk.c,v  -->  standard output
revision 1.4.2.1
--- cluster/cman/qdisk/disk.c
+++ -	2006-07-21 18:01:39.760295000 +0000
@@ -0,0 +1,758 @@
+/*
+  Copyright Red Hat, Inc. 2002-2003, 2006
+  Copyright Mission Critical Linux, 2000
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/** @file
+ * Single-block Raw/Direct I/O Functions
+ */
+/*
+ *  author: Tim Burke <tburke at redhat.com>
+ *  description: Raw IO Interfaces.
+ *
+ * The RAW IO code we are using from 2.2.13 requires user buffers and
+ * disk offsets to be 512 byte aligned.  So this code consists of a 
+ * read and write routine which check to see if the user buffer is 
+ * aligned.  If it isn't a temporary aligned buffer is allocated, a data
+ * copy is performed along with the IO operation itself.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <time.h>
+
+static int diskRawRead(int fd, char *buf, int len);
+uint32_t clu_crc32(const char *data, size_t count);
+
+
+/**
+ * Swap the bytes of a shared header so that it's always in big-endian form
+ * when stored on disk.
+ *
+ * The magic, both CRC fields, the length, the view and the timestamp are
+ * each passed through swab32()/swab64().
+ * NOTE(review): swab32, swab64 and be_swap32 are not defined in this file
+ * (presumably they come from platform.h); they appear to modify their
+ * argument in place -- confirm.
+ *
+ * @param hdr		Header to encode.
+ */
+static void
+header_encode(shared_header_t *hdr)
+{
+	/* sanity check - LE machine -> already encoded. */
+	/* If the magic already matches its byte-swapped form, leave the
+	 * header untouched rather than swapping it a second time. */
+	if (hdr->h_magic == be_swap32(SHARED_HEADER_MAGIC))
+		return;
+
+	swab32(hdr->h_magic);
+	swab32(hdr->h_hcrc);
+	swab32(hdr->h_dcrc);
+	swab32(hdr->h_length);
+	swab64(hdr->h_view);
+	swab64(hdr->h_timestamp);
+}
+
+
+/**
+ * Swap the bytes of a shared header so that it's always in host-byte order
+ * after we read it.  This should be a macro calling header_encode.
+ *
+ * The same six fields as in header_encode() are passed through
+ * swab32()/swab64().
+ * NOTE(review): swab32 and swab64 are not defined in this file (presumably
+ * they come from platform.h) -- confirm their semantics there.
+ *
+ * @param hdr		Header to decode.
+ */
+static void
+header_decode(shared_header_t *hdr)
+{
+	/* sanity check - LE machine -> already decoded. */
+	/* A magic that already reads correctly means there is nothing to
+	 * swap; return without touching the header. */
+	if (hdr->h_magic == SHARED_HEADER_MAGIC)
+		return;
+
+	swab32(hdr->h_magic);
+	swab32(hdr->h_hcrc);
+	swab32(hdr->h_dcrc);
+	swab32(hdr->h_length);
+	swab64(hdr->h_view);
+	swab64(hdr->h_timestamp);
+}
+
+
+/**
+ * Fill in a shared header for a block of data: magic, data CRC32, data
+ * length, timestamp and finally the header's own CRC32.
+ *
+ * The header CRC is computed last, over the whole header with its own CRC
+ * field still zero, so it also protects the stored data CRC.  If the header
+ * CRC checks out on read, the data CRC is very likely intact, and a data
+ * CRC that then fails to match the data indicates a problem with the data.
+ *
+ * On success the header has been passed through header_encode().
+ *
+ * @param hdr		Preallocated pointer to shared_header_t structure.
+ * @param data		Data to be stored with hdr (may be NULL).
+ * @param count		Size of data.
+ * @return		-1 if a generated CRC32 is 0 (treated as invalid), or
+ *			0 on success.
+ */
+static int
+header_generate(shared_header_t *hdr, const char *data, size_t count)
+{
+	const int have_payload = (data != NULL) && (count != 0);
+
+	/* Start from all zeroes; h_hcrc must be 0 while it is computed. */
+	memset(hdr, 0, sizeof(shared_header_t));
+	hdr->h_magic = SHARED_HEADER_MAGIC;
+
+	if (have_payload) {
+		hdr->h_dcrc = clu_crc32(data, count);
+		hdr->h_length = (uint32_t)count;
+
+		if (!hdr->h_dcrc) {
+			fprintf(stderr, "Invalid CRC32 generated on data!\n");
+			return -1;
+		}
+	}
+
+	hdr->h_timestamp = (uint64_t)time(NULL);
+
+	hdr->h_hcrc = clu_crc32((char *)hdr, sizeof(shared_header_t));
+	if (!hdr->h_hcrc) {
+		fprintf(stderr, "Invalid CRC32 generated on header!\n");
+		return -1;
+	}
+
+	header_encode(hdr);
+	return 0;
+}
+
+
+/**
+ * Check a shared block: the header CRC32, the magic number and, when
+ * possible, the data CRC32.  A better name for this would be
+ * "shared_block_verify".
+ *
+ * The header is passed through header_decode() first, so it may be
+ * modified in place.  h_hcrc is zeroed while the header checksum is
+ * recomputed and is restored afterwards.
+ *
+ * @param hdr		Header to verify.
+ * @param data		Data that accompanies hdr, or NULL.
+ * @param count		Number of bytes available at data.
+ * @return		-1 if the header CRC32, the magic number or the data
+ *			CRC32 does not match; 0 otherwise.  0 is also
+ *			returned, without checking the data, when data is
+ *			NULL, count is 0, or count is less than hdr->h_length.
+ */
+static int
+header_verify(shared_header_t *hdr, const char *data, size_t count)
+{
+	uint32_t stored_hcrc;
+	uint32_t computed;
+
+	header_decode(hdr);
+
+	/*
+	 * The header checksum was generated with the h_hcrc field still
+	 * zero, so blank it for the recomputation and put it back after.
+	 */
+	stored_hcrc = hdr->h_hcrc;
+	hdr->h_hcrc = 0;
+	computed = clu_crc32((char *)hdr, sizeof(*hdr));
+	hdr->h_hcrc = stored_hcrc;
+
+	if (computed != stored_hcrc)
+		return -1;
+
+	if (hdr->h_magic != SHARED_HEADER_MAGIC)
+		return -1;
+
+	/* Nothing to check the data CRC against, or not enough of it. */
+	if (data == NULL || count == 0 || count < hdr->h_length)
+		return 0;
+
+	/* count >= h_length here, so exactly h_length bytes are summed. */
+	computed = clu_crc32(data, hdr->h_length);
+
+	return (computed == hdr->h_dcrc) ? 0 : -1;
+}
+
+
+
+/*
+ * qdisk_open
+ * Called to open the shared state partition with appropriate mode.
+ *
+ * The device is opened read/write with O_SYNC | O_DIRECT, checked to be
+ * seekable up to END_OF_DISK, and marked close-on-exec.
+ *
+ * Returns - (the file descriptor), a value >= 0 on success, or a negative
+ * value on failure.  No descriptor is left open on failure.
+ */
+int
+qdisk_open(char *name)
+{
+	int fd;
+	int fdflags;
+	int saved_errno;
+	off_t end;
+
+	/*
+	 * Open for synchronous writes to insure all writes go directly
+	 * to disk.
+	 */
+	fd = open(name, O_RDWR | O_SYNC | O_DIRECT);
+	if (fd < 0) {
+		return fd;
+	}
+
+	/* Check to verify that the partition is large enough.*/
+	end = lseek(fd, END_OF_DISK, SEEK_SET);
+
+	if (end < 0) {
+		perror("open_partition: seek");
+		goto fail;
+	}
+
+	if (end < END_OF_DISK) {
+		fprintf(stderr, "Partition %s too small\n", name);
+		errno = EINVAL;
+		goto fail;
+	}
+
+	/* Set close-on-exec bit */
+	fdflags = fcntl(fd, F_GETFD, 0);
+	if (fdflags < 0)
+		goto fail;
+
+	fdflags |= FD_CLOEXEC;
+	if (fcntl(fd, F_SETFD, fdflags) < 0) {
+		perror("open_partition: fcntl");
+		goto fail;
+	}
+
+	return fd;
+
+fail:
+	/* Release the descriptor, keeping the errno that describes the
+	 * actual failure rather than whatever close() leaves behind. */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	return -1;
+}
+
+
+/*
+ * qdisk_close
+ * Closes the shared state disk partition and resets the caller's
+ * descriptor to -1.
+ * Returns - value from close syscall, or -1 with errno set to EINVAL if
+ * fd is NULL or does not hold a descriptor (*fd < 0).
+ */
+int
+qdisk_close(int *fd)
+{
+	int rc;
+
+	if (fd == NULL || *fd < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	rc = close(*fd);
+
+	/* Whatever close() returned, the caller must not reuse the value. */
+	*fd = -1;
+
+	return rc;
+}
+
+/*
+ * qdisk_validate
+ * Called to verify that the specified device special file representing
+ * the partition appears to be a valid device: it must be stat()able and
+ * qdisk_open() must succeed on it.  A file that is neither a block nor a
+ * character special file only produces a warning.
+ * Returns: 0 - success, -1 - failure
+ */
+int
+qdisk_validate(char *name)
+{
+	struct stat st;
+	int fd;
+
+	if (stat(name, &st) < 0) {
+		perror("stat");
+		return -1;
+	}
+
+	/*
+	 * A regular file (or anything else that is not a block or character
+	 * special file) is tolerated, but the user is told about it.
+	 */
+	if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
+		fprintf(stderr, "Warning: %s is not a block device\n",
+		        name);
+	}
+
+	/*
+	 * Verify read/write permission by actually opening the device.
+	 */
+	fd = qdisk_open(name);
+	if (fd < 0) {
+		fprintf(stderr, "%s: open of %s for RDWR failed: %s\n",
+			__FUNCTION__, name, strerror(errno));
+		return -1;
+	}
+
+	qdisk_close(&fd);
+	return 0;
+}
+
+
+/*
+ * diskRawReadShadow
+ * Seek to readOffset, read len bytes into buf with diskRawRead() and
+ * verify the shared header found at the start of the buffer.
+ *
+ * Returns 0 on success.  Returns -1 with errno set to ENODATA if the seek
+ * or the read comes up short, or to EPROTO if header_verify() rejects the
+ * block.
+ *
+ * NOTE(review): header_verify() is given len, the size of the whole buffer
+ * including the header, as the number of data bytes -- confirm that this
+ * is intended.
+ */
+static int
+diskRawReadShadow(int fd, off_t readOffset, char *buf, int len)
+{
+	off_t pos;	/* off_t, so large offsets are not truncated */
+	int nread;
+	shared_header_t *hdrp;
+	char *data;
+
+	pos = lseek(fd, readOffset, SEEK_SET);
+	if (pos != readOffset) {
+#if 0
+		fprintf(stderr,
+		       "diskRawReadShadow: can't seek to offset %d.\n",
+		       (int) readOffset);
+#endif
+		errno = ENODATA;
+		return -1;
+	}
+
+	nread = diskRawRead(fd, buf, len);
+	if (nread != len) {
+#if 0
+		fprintf(stderr, "diskRawReadShadow: aligned read "
+		       "returned %d, not %d.\n", nread, len);
+#endif
+		errno = ENODATA;
+		return -1;
+	}
+
+	/* Decode the header portion so we can run a checksum on it. */
+	hdrp = (shared_header_t *)buf;
+	data = (char *)buf + sizeof(*hdrp);
+	swab_shared_header_t(hdrp);
+
+	if (header_verify(hdrp, data, len)) {
+#if 0
+		fprintf(stderr, "diskRawReadShadow: bad CRC32, "
+		       "fd = %d offset = %d len = %d\n", fd,
+		       (int) readOffset, len);
+#endif
+		errno = EPROTO;
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * The RAW IO implementation requires buffers to be 512 byte aligned.
+ * Here we check for alignment and do a bounceio if necessary.
+ *
+ * A buffer whose address has its low ten bits clear and whose length is a
+ * multiple of 512 is read into directly.  Anything else (up to 512 bytes)
+ * is read into a temporary aligned buffer, with the length rounded up to a
+ * multiple of 512, and then copied out.
+ *
+ * Returns the number of bytes placed in buf (at most len), 0 at end of
+ * file, or -1 on error.
+ */
+static int
+diskRawRead(int fd, char *buf, int len)
+{
+	void *bounce = NULL;
+	int rc;
+	int nread;
+	int padded_len;
+
+	if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) &&
+	    ((len % 512) == 0)) {
+		/* Already aligned and even multiple of 512, no bounceio
+		 * required. */
+		return (read(fd, buf, len));
+	}
+
+	if (len > 512) {
+		fprintf(stderr,
+			"diskRawRead: not setup for reads larger than %d.\n",
+		       512);
+		return (-1);
+	}
+
+	/*
+	 * All IOs must be of size which is a multiple of 512.  Here we
+	 * just add in enough extra to accommodate.
+	 * XXX - if the on-disk offsets don't provide enough room we're cooked!
+	 */
+	padded_len = len;
+	if (len % 512) {
+		padded_len += 512 - (len % 512);
+	}
+
+	/*
+	 * posix_memalign() reports failure by returning a nonzero error
+	 * number; it never returns a negative value and does not set errno.
+	 */
+	rc = posix_memalign(&bounce, 512, 512);
+	if (rc != 0) {
+		errno = rc;
+		return -1;
+	}
+
+	nread = read(fd, bounce, padded_len);
+	if (nread > 0) {
+		/* Hand back only what the caller asked for. */
+		if (nread > len) {
+			nread = len;
+		}
+		memcpy(buf, bounce, nread);
+	}
+
+	free(bounce);
+	if (nread != len) {
+		fprintf(stderr, "diskRawRead: read err, len=%d, readret=%d\n",
+			len, nread);
+	}
+
+	return (nread);
+}
+
+
+/*
+ * The RAW IO implementation requires buffers to be 512 byte aligned.
+ * Here we check for alignment and do a bounceio if necessary.
+ *
+ * A buffer whose address has its low ten bits clear and whose length is a
+ * multiple of 512 is written directly.  Anything else (up to 512 bytes) is
+ * copied into a temporary aligned buffer and written with the length
+ * rounded up to a multiple of 512; the padding bytes are zero.
+ *
+ * Returns the number of bytes of buf that were written (at most len), or
+ * -1 on error.
+ */
+static int
+diskRawWrite(int fd, char *buf, int len)
+{
+	void *bounce = NULL;
+	int ret;
+	int padded_len;
+
+	if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) &&
+	    ((len % 512) == 0)) {
+		/* Already aligned and even multiple of 512, no bounceio
+		 * required. */
+		return (write(fd, buf, len));
+	}
+
+	if (len > 512) {
+		fprintf(stderr,
+		       "diskRawWrite: not setup for larger than %d.\n",
+		       512);
+		return (-1);
+	}
+
+	/*
+	 * All IOs must be of size which is a multiple of 512.  Here we
+	 * just add in enough extra to accommodate.
+	 * XXX - if the on-disk offsets don't provide enough room we're cooked!
+	 */
+	padded_len = len;
+	if (len % 512) {
+		padded_len += 512 - (len % 512);
+	}
+
+	/*
+	 * posix_memalign() reports failure by returning a nonzero error
+	 * number; it never returns a negative value and does not set errno.
+	 */
+	ret = posix_memalign(&bounce, 512, 512);
+	if (ret != 0) {
+		errno = ret;
+		return (-1);
+	}
+
+	/* Zero the block first so the padding that follows the caller's
+	 * data on disk is not left-over heap contents. */
+	memset(bounce, 0, 512);
+	memcpy(bounce, buf, len);
+
+	ret = write(fd, bounce, padded_len);
+	if (ret > len) {
+		/* Do not count the padding as caller data. */
+		ret = len;
+	}
+
+	free(bounce);
+	if (ret != len) {
+		fprintf(stderr, "diskRawWrite: write err, len=%d, ret=%d\n",
+		       len, ret);
+	}
+
+	return (ret);
+}
+
+
+/*
+ * diskRawWriteShadow
+ * Seek to writeOffset and write len bytes from buf with diskRawWrite().
+ *
+ * Returns 0 if exactly len bytes were written.  Returns -1, after printing
+ * a message on stderr, if an argument is negative, the seek does not land
+ * on writeOffset, or the write comes up short.
+ */
+static int
+diskRawWriteShadow(int fd, __off64_t writeOffset, char *buf, int len)
+{
+	off_t pos;
+	ssize_t written;
+
+	if (writeOffset < 0 || len < 0) {
+		fprintf(stderr,
+		       "diskRawWriteShadow: writeOffset=%08x, "
+		       "len=%08x.\n", (int)writeOffset, len);
+		return (-1);
+	}
+
+	pos = lseek(fd, writeOffset, SEEK_SET);
+	if (pos != writeOffset) {
+		fprintf(stderr,
+		       "diskRawWriteShadow: can't seek to offset %d\n",
+		       (int) writeOffset);
+		return (-1);
+	}
+
+	written = diskRawWrite(fd, buf, len);
+	if (written == len)
+		return 0;
+
+	/* Short or failed write: report errno first when it is meaningful. */
+	if (written == -1) {
+		fprintf(stderr, "%s: %s\n", __FUNCTION__,
+		       strerror(errno));
+	}
+	fprintf(stderr,
+	       "diskRawWriteShadow: aligned write returned %d"
+	       ", not %d\n", (int)written, (int)len);
+
+	return (-1);
+}
+
+
+/*
+ * qdisk_read
+ * Read one shared block (header plus data) from the given offset and copy
+ * its data into the caller's buffer.
+ *
+ * At most count bytes are copied out.  If the block holds fewer than count
+ * bytes of data, the rest of buf is zero-filled.
+ *
+ * Returns count on success, or -1 on failure (errno is EINVAL for bad
+ * arguments, otherwise as left by the allocation or by the read).
+ */
+int
+qdisk_read(int fd, __off64_t offset, void *buf, int count)
+{
+	shared_header_t *hdrp = NULL;
+	char *data;
+	size_t total;
+	size_t payload;
+	int rv;
+	int saved_errno;
+
+	if (buf == NULL || count < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * Calculate the total length of the buffer, including the header.
+	 * Raw blocks are 512 byte aligned.
+	 */
+	total = count + sizeof(shared_header_t);
+	if (total < 512)
+		total = 512;
+
+	/* Round it up */
+	if (total % 512) 
+		total += 512 - (total % 512);
+
+	/*
+	 * posix_memalign() reports failure by returning a nonzero error
+	 * number; it never returns a negative value and does not set errno.
+	 */
+	rv = posix_memalign((void **)&hdrp, sysconf(_SC_PAGESIZE), total);
+	if (rv != 0 || hdrp == NULL) {
+		errno = rv ? rv : ENOMEM;
+		return -1;
+	}
+
+	data = (char *)hdrp + sizeof(shared_header_t);
+
+	rv = diskRawReadShadow(fd, offset, (char *)hdrp, total);
+	
+	if (rv == -1) {
+		/* Do not leak the block buffer; keep the read's errno. */
+		saved_errno = errno;
+		free(hdrp);
+		errno = saved_errno;
+		return -1;
+	}
+	
+	/*
+	 * Copy out the data.  The length recorded in the header comes from
+	 * disk, so never copy more than the caller's buffer can hold.
+	 */
+	payload = hdrp->h_length;
+	if (payload > (size_t)count)
+		payload = (size_t)count;
+
+	memcpy(buf, data, payload);
+
+	/* Zero out the remainder. */
+	if (payload < (size_t)count) {
+		memset((char *)buf + payload, 0,
+		       (size_t)count - payload);
+	}
+
+	free(hdrp);
+	return count;
+}
+
+
+/**
+  Write one header-prefixed record to the quorum disk.
+
+  The payload is copied behind a freshly generated shared_header_t into a
+  zero-filled, aligned 512-byte block, and that single block is written
+  at offset.  Locking must be performed elsewhere.
+
+  @param fd	Open descriptor of the quorum partition.
+  @param offset	Absolute byte offset to write the block at.
+  @param buf	Payload to store.
+  @param count	Payload length; must fit in 512 bytes together with the
+		header, otherwise errno is set to ENOSPC.
+  @return	count on success, -1 on error.
+ */
+int
+qdisk_write(int fd, __off64_t offset, const void *buf, int count)
+{
+	const size_t psz = 512; //sysconf(_SC_PAGESIZE);
+	const size_t maxsize = psz - sizeof(shared_header_t);
+	shared_header_t *hdrp = NULL;
+	char *data;
+	int rv;
+
+	if (buf == NULL || count < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * Only a single block is ever written, so header + payload must
+	 * fit in it.  Anything larger used to be silently dropped while
+	 * still reporting success.
+	 */
+	if ((size_t)count > maxsize) {
+		printf("error: count %d > maximum payload %d\n", (int)count,
+		       (int)maxsize);
+		errno = ENOSPC;
+		return -1;
+	}
+
+	/* posix_memalign() returns an error number and leaves errno alone */
+	rv = posix_memalign((void **)&hdrp, sysconf(_SC_PAGESIZE), psz);
+	if (rv != 0) {
+		errno = rv;
+		perror("posix_memalign");
+		return -1;
+	}
+
+	/*
+	 * Zero the block first so the padding written to disk is
+	 * deterministic, then copy the data into our new buffer.
+	 */
+	memset(hdrp, 0, psz);
+	data = (char *)hdrp + sizeof(shared_header_t);
+	memcpy(data, buf, count);
+
+	if (header_generate(hdrp, buf, count) == -1) {
+		free((char *)hdrp);
+		return -1;
+	}
+	swab_shared_header_t(hdrp);
+
+	/*
+	 * Locking must be performed elsewhere.  We make no assumptions
+	 * about locking here.
+	 */
+	rv = diskRawWriteShadow(fd, offset, (char *)hdrp, psz);
+	if (rv == -1)
+		perror("diskRawWriteShadow");
+
+	free((char *)hdrp);
+	if (rv == -1)
+		return -1;
+	return count;
+}
+
+
+/**
+  Write a fresh quorum_header_t at OFFSET_HEADER.
+
+  Before overwriting, the existing header is read; a warning is printed
+  if it carries the old 1.2.x magic or the current magic.
+
+  @param fd	Open descriptor of the quorum partition.
+  @param label	Cluster/label name stored in qh_cluster (truncated to fit).
+		A NULL label is stored as an empty string.
+  @return	0 on success, -1 on failure.
+ */
+static int
+header_init(int fd, char *label)
+{
+	quorum_header_t qh;
+	time_t now;
+
+	if (qdisk_read(fd, OFFSET_HEADER, &qh, sizeof(qh)) == sizeof(qh)) {
+		swab_quorum_header_t(&qh);
+		if (qh.qh_magic == HEADER_MAGIC_OLD) {
+			printf("Warning: Red Hat Cluster Manager 1.2.x "
+			       "header found\n");
+		} else if (qh.qh_magic == HEADER_MAGIC_NUMBER) {
+			printf("Warning: Initializing previously "
+			       "initialized partition\n");
+		}
+	}
+
+	/*
+	 * Start from a clean slate: neither stack garbage (failed read)
+	 * nor stale bytes of the old header should reach the disk.
+	 */
+	memset(&qh, 0, sizeof(qh));
+
+	/* Leave the last byte alone so the name is always terminated. */
+	if (gethostname(qh.qh_updatehost, sizeof(qh.qh_updatehost) - 1) < 0) {
+		perror("gethostname");
+		return -1;
+	}
+
+	/*
+	 * Copy in the cluster/label name.  The label is data, never a
+	 * format string.
+	 */
+	snprintf(qh.qh_cluster, sizeof(qh.qh_cluster)-1, "%s",
+		 label ? label : "");
+
+	now = time(NULL);
+	if (now == (time_t)-1) {
+		perror("time");
+		return -1;
+	}
+	qh.qh_timestamp = (uint64_t)now;
+
+	qh.qh_magic = HEADER_MAGIC_NUMBER;
+	swab_quorum_header_t(&qh);
+	if (qdisk_write(fd, OFFSET_HEADER, &qh, sizeof(qh)) != sizeof(qh)) {
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/**
+  Initialize a quorum partition: write the partition header and one
+  status block (state S_NONE) for each node ID 1..MAX_NODES_DISK.
+
+  @param partname	Device/partition to initialize.
+  @param label		Label stored in the partition header.
+  @return		0 on success, -1 on failure.
+ */
+int
+qdisk_init(char *partname, char *label)
+{
+	int fd;
+	status_block_t ps, wps;
+	int nid;
+	time_t t;
+
+	if (qdisk_validate(partname) < 0) {
+		perror("qdisk_verify");
+		return -1;
+	}
+
+	fd = qdisk_open(partname);
+	if (fd < 0) {
+		perror("qdisk_open");
+		return -1;
+	}
+
+	if (header_init(fd, label) < 0) {
+		/* Do not leak the descriptor on failure. */
+		qdisk_close(&fd);
+		return -1;
+	}
+
+	time(&t);
+
+	/*
+	 * Zero everything first so that fields not named below
+	 * (incarnation, message, masks) are written as zeroes rather than
+	 * as stack garbage.
+	 */
+	memset(&ps, 0, sizeof(ps));
+	ps.ps_magic = STATE_MAGIC_NUMBER;
+	ps.ps_updatenode = 0;
+	ps.pad0 = 0;
+	ps.ps_timestamp = (uint64_t)t;
+	ps.ps_state = (uint8_t)S_NONE;
+	ps.pad1[0] = 0;
+	ps.ps_flags = 0;
+	ps.ps_score = 0;
+	ps.ps_scoremax = 0;
+	ps.ps_ca_sec = 0;
+	ps.ps_ca_usec = 0;
+	ps.ps_lc_sec = 0;
+	ps.ps_lc_usec = 0;
+
+	/* Node IDs 1..N */
+	for (nid = 1; nid <= MAX_NODES_DISK; nid++) {
+		ps.ps_nodeid = nid;
+
+		printf("Initializing status block for node %d...\n", nid);
+		/* Swap a copy; ps stays in host order for the next pass. */
+		wps = ps;
+		swab_status_block_t(&wps);
+
+		if (qdisk_write(fd, qdisk_nodeid_offset(nid), &wps, sizeof(wps)) < 0) {
+			printf("Error writing node ID block %d\n", nid);
+			qdisk_close(&fd);
+			return -1;
+		}
+	}
+
+	qdisk_close(&fd);
+
+	return 0;
+}
+
/cvs/cluster/cluster/cman/qdisk/disk.h,v  -->  standard output
revision 1.3.2.1
--- cluster/cman/qdisk/disk.h
+++ -	2006-07-21 18:01:39.865378000 +0000
@@ -0,0 +1,269 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+  @file Main quorum daemon include file
+ */
+#ifndef _QUORUM_DISK_H
+#define _QUORUM_DISK_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include <arpa/inet.h>
+#include <libcman.h>
+
+/* Number of node status blocks kept on the quorum disk */
+#define MAX_NODES_DISK		16
+/* Bytes needed for one bit per node, rounded up to a whole byte */
+#define MEMB_MASK_LEN		((MAX_NODES_DISK + 7) / 8)
+/* Same, rounded up to the next multiple of 8 bytes */
+#define DISK_MEMB_MASK_LEN	((MEMB_MASK_LEN + 7) & ~7)
+
+/** Membership bitmask: one bit per node, DISK_MEMB_MASK_LEN bytes long */
+typedef uint8_t memb_mask_t [DISK_MEMB_MASK_LEN];
+
+/** Node state as stored in status_block_t.ps_state */
+typedef enum {
+	/* S_NONE and S_EVICT: fencing OK */
+	S_NONE   = 0,	/* Shutdown / not quorate / not running */
+	S_EVICT  = 1,	/* Voted out / about to be fenced. */
+
+	S_INIT   = 2,	/* Initializing.  Hold your fire. */
+
+	/* S_RUN and S_MASTER: fencing will kill a node */
+	S_RUN    = 5,	/* I think I'm running. */
+	S_MASTER = 6	/* I know I'm running, and have advertised to
+			   CMAN the availability of the disk vote for my
+			   partition. */
+} disk_node_state_t;
+
+
+/** Message identifiers carried in status_block_t.ps_msg */
+typedef enum {
+	M_NONE = 0,
+	M_BID  = 1,
+	M_ACK  = 2,
+	M_NACK = 3,
+	M_MASK = 4
+} disk_msg_id_t;
+
+
+/** Bit flags; each value occupies its own bit */
+typedef enum {
+	FL_MSG  = (1 << 0),
+	FL_BID  = (1 << 1),
+	FL_VOTE = (1 << 2)
+} disk_state_flag_t;
+
+
+/* RHEL 2.1 / RHCS3 old magic numbers */
+#define HEADER_MAGIC_OLD	0x39119FCD	/* partition header */
+#define STATE_MAGIC_OLD		0xF1840DCE	/* Status block */
+#define SHARED_HEADER_MAGIC_OLD	0x00DEBB1E	/* Per-block header */
+
+/*
+ * Current magic numbers.  Note that the per-block header magic has the
+ * same value as SHARED_HEADER_MAGIC_OLD.
+ */
+#define HEADER_MAGIC_NUMBER	0xeb7a62c2	/* Partition header */
+#define STATE_MAGIC_NUMBER	0x47bacef8	/* Status block */
+#define SHARED_HEADER_MAGIC	0x00DEBB1E	/* Per-block header */
+
+
+/*
+ * Per-node status block as stored on the quorum disk.  The structure is
+ * packed; the numeric comments give the byte offset reached after the
+ * preceding member(s).  Total size is 80 bytes.
+ */
+typedef struct __attribute__ ((packed)) {
+	uint32_t	ps_magic;
+	/* 4 */
+	uint32_t	ps_updatenode;		// Last writer
+	/* 8 */
+	uint64_t	ps_timestamp;		// time of last update
+	/* 16 */
+	uint32_t	ps_nodeid;
+	uint32_t	pad0;
+	/* 24 */
+	uint8_t		ps_state;		// running or stopped
+	uint8_t		pad1[1];
+	uint16_t	ps_flags;
+	/* 28 */
+	uint16_t	ps_score;		// Local points
+	uint16_t	ps_scoremax;		// What we think is our max
+						// points, if other nodes
+						// disagree, we may be voted
+						// out
+	/* 32 */
+	uint32_t	ps_ca_sec;		// Cycle speed (average)
+	uint32_t	ps_ca_usec;
+	/* 40 */
+	uint32_t	ps_lc_sec;		// Cycle speed (last)
+	uint32_t	ps_lc_usec;
+	/* 48 */
+	uint64_t	ps_incarnation;		// Token to detect hung +
+						// restored node
+	/* 56 */
+	uint16_t	ps_msg;			// Vote/bid mechanism 
+	uint16_t	ps_seq;
+	uint32_t	ps_arg;
+	/* 64 */
+	memb_mask_t	ps_mask;		// Bitmap
+	memb_mask_t	ps_master_mask;		// Bitmap
+	/* 80 */
+} status_block_t;
+
+/*
+ * Apply swab16/swab32/swab64 in place to the multi-byte integer members
+ * of the status_block_t that ptr points to.
+ *
+ * Members not touched here: ps_state, pad1, ps_mask, ps_master_mask
+ * (byte-sized or byte arrays) and ps_incarnation.
+ * NOTE(review): ps_incarnation is a uint64_t but is not swapped; the
+ * visible code only compares it for equality -- confirm this omission
+ * is intentional.
+ *
+ * The macro expands to a bare { } block, so it must be used as a
+ * statement of its own.
+ */
+#define swab_status_block_t(ptr) \
+{\
+	swab32((ptr)->ps_magic);\
+	swab32((ptr)->ps_updatenode);\
+	swab64((ptr)->ps_timestamp);\
+	swab32((ptr)->ps_nodeid);\
+	swab32((ptr)->pad0);\
+	/* state + pad */ \
+	swab16((ptr)->ps_flags);\
+	swab16((ptr)->ps_score);\
+	swab16((ptr)->ps_scoremax);\
+	/* Cycle speeds */ \
+	swab32((ptr)->ps_ca_sec);\
+	swab32((ptr)->ps_ca_usec);\
+	swab32((ptr)->ps_lc_sec);\
+	swab32((ptr)->ps_lc_usec);\
+	/* Message */ \
+	swab16((ptr)->ps_msg); \
+	swab16((ptr)->ps_seq); \
+	swab32((ptr)->ps_arg); \
+ }
+
+
+/*
+ * Shared state disk header.  Describes cluster global information.
+ * Packed; 272 bytes in total (4 + 4 + 8 + 128 + 128).
+ */
+typedef struct __attribute__ ((packed)) {
+	uint32_t	qh_magic;	   // HEADER_MAGIC_NUMBER when initialized
+	uint32_t	qh_align;	   // 64-bit-ism: alignment fixer.
+	uint64_t	qh_timestamp;	   // time of last update
+	char 		qh_updatehost[128];// Hostname who put this here...
+	char		qh_cluster[128];   // Cluster name
+} quorum_header_t;
+
+/*
+ * Apply swab32/swab64 in place to the integer members of the
+ * quorum_header_t that ptr points to.  The two character arrays are left
+ * untouched.  Expands to a bare { } block; use as a statement.
+ */
+#define swab_quorum_header_t(ptr) \
+{\
+	swab32((ptr)->qh_magic); \
+	swab32((ptr)->qh_align); \
+	swab64((ptr)->qh_timestamp); \
+}
+
+
+
+/*
+ * The user data is stored with this header prepended.
+ * The header ONLY contains CRC information and the length of the data.
+ * The data blocks themselves contain their own respective magic numbers.
+ * Packed; 32 bytes in total (4 * 4 + 2 * 8).
+ */
+typedef struct __attribute__ ((packed)) {
+	uint32_t h_magic;		/* Header magic	       */
+	uint32_t h_hcrc;		/* Header CRC          */
+	uint32_t h_dcrc;		/* CRC32 of data       */
+	uint32_t h_length;		/* Length of real data */
+	uint64_t h_view;		/* View # of real data */
+	uint64_t h_timestamp;		/* Timestamp           */
+} shared_header_t;
+
+/*
+ * Zero initializer for a shared_header_t.  The expansion begins with
+ * '=', so it is written directly after the declarator, with no '=' of
+ * its own.
+ */
+#define SHARED_HEADER_INITIALIZER = {0, 0, 0, 0, 0, 0}
+
+/*
+ * Apply swab32/swab64 in place to every member of the shared_header_t
+ * that ptr points to.  Expands to a bare { } block; use as a statement.
+ */
+#define swab_shared_header_t(ptr) \
+{\
+	swab32((ptr)->h_magic);\
+	swab32((ptr)->h_hcrc);\
+	swab32((ptr)->h_dcrc);\
+	swab32((ptr)->h_length);\
+	swab64((ptr)->h_view);\
+	swab64((ptr)->h_timestamp);\
+}
+
+
+/*
+ * On-disk layout (offsets from RHCM 1.2.x): a header area at
+ * OFFSET_HEADER, followed by one status block slot per node.
+ */
+#define OFFSET_HEADER	0
+#define HEADER_SIZE	4096		/* Page size for now */
+
+#define OFFSET_FIRST_STATUS_BLOCK	(OFFSET_HEADER + HEADER_SIZE)
+#define SPACE_PER_STATUS_BLOCK		4096 /* Page size for now */
+#define STATUS_BLOCK_COUNT		MAX_NODES_DISK
+
+#define SPACE_PER_MESSAGE_BLOCK		(4096)
+#define	MESSAGE_BLOCK_COUNT		MAX_NODES_DISK
+
+/*
+ * First byte past the area described above.  The definition must not
+ * end in a line continuation, or the line following it becomes part of
+ * the macro.
+ */
+#define END_OF_DISK			(OFFSET_FIRST_STATUS_BLOCK + \
+					 (MAX_NODES_DISK + 1) * \
+					 SPACE_PER_STATUS_BLOCK)
+
+
+
+/*
+ * From disk.c
+ *
+ * qdisk_open/qdisk_validate return a negative value on failure.
+ * qdisk_close takes a pointer to the descriptor.
+ * qdisk_init writes the partition header and all status blocks.
+ * qdisk_read/qdisk_write transfer one header-prefixed record at byte
+ * offset ofs and return len on success, -1 on error.
+ */
+int qdisk_open(char *name);
+int qdisk_close(int *fd);
+int qdisk_init(char *name, char *clustername);
+int qdisk_validate(char *name);
+int qdisk_read(int fd, __off64_t ofs, void *buf, int len);
+int qdisk_write(int fd, __off64_t ofs, const void *buf, int len);
+
+/*
+ * Byte offset of the status block belonging to a (1-based) node ID.
+ * The argument is parenthesized so that any expression may be passed.
+ */
+#define qdisk_nodeid_offset(nodeid) \
+	(OFFSET_FIRST_STATUS_BLOCK + (SPACE_PER_STATUS_BLOCK * ((nodeid) - 1)))
+
+/* From disk_utils.c */
+/* Number of write-time samples kept in qd_ctx.qc_last */
+#define HISTORY_LENGTH 60
+/*
+ * In-memory form of the message part of a status block; the members
+ * correspond to ps_msg, ps_arg and ps_seq of status_block_t.
+ */
+typedef struct {
+	disk_msg_id_t m_msg;	 /* this is an int, but will be stored as 16bit*/
+	uint32_t m_arg;
+	uint16_t m_seq;
+	uint16_t pad0;
+} disk_msg_t;
+
+/*
+ * Quorum disk context.  Zeroed and seeded by qd_init().
+ */
+typedef struct {
+	uint64_t qc_incarnation;	/* Token from generate_token(); written
+					   to disk as ps_incarnation */
+	struct timeval qc_average;	/* Average of the recorded write times */
+	struct timeval qc_last[HISTORY_LENGTH];	/* Recent write times */
+	int qc_fd;			/* Descriptor of the quorum device */
+	int qc_my_id;			/* Our own node ID */
+	int qc_writes;			/* Number of write times recorded */
+	int qc_interval;		/* Passed to sleep() between init cycles */
+	int qc_tko;			/* Miss threshold; also the number of
+					   cycles quorum_init() waits */
+	int qc_votes;
+	int qc_scoremin;
+	disk_node_state_t qc_disk_status;
+	disk_node_state_t qc_status;	/* Compared against S_MASTER before
+					   eviction notices are written */
+	int qc_master;		/* Master?! */
+	int qc_unused;
+	cman_handle_t qc_ch;		/* CMAN handle given to qd_init() */
+	char *qc_device;		/* Device name; freed by qd_destroy() */
+	char *qc_label;
+	char *qc_status_file;
+} qd_ctx;
+
+/*
+ * Our local view of one node, kept next to the last status block read
+ * for it.
+ */
+typedef struct {
+	uint64_t ni_incarnation;	/* Incarnation recorded when the node
+					   was declared up; 0 when offline */
+	uint64_t ni_evil_incarnation;	/* Incarnation recorded when the node
+					   timed out; seeing it again means
+					   the node is "undead" */
+	time_t	ni_last_seen;		/* Last ps_timestamp value observed */
+	int	ni_misses;		/* Consecutive reads with an unchanged
+					   timestamp */
+	int	ni_seen;		/* Reads with a changed timestamp */
+	disk_msg_t ni_msg;		/* Message fields of the last block read */
+	disk_msg_t ni_last_msg;
+	disk_node_state_t ni_state;	/* State we believe the node is in */
+	status_block_t ni_status;	/* Last status block read (host order) */
+} node_info_t;
+
+/*
+ * disk_util.c: qd_write_status writes the status block of node nid
+ * (msg, mask and master may be NULL) and returns 0 or -1;
+ * qd_read_print_status reads and prints one block; qd_init/qd_destroy
+ * set up and tear down a context.
+ */
+int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
+		    disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
+int qd_read_print_status(int fd, int nid);
+int qd_init(qd_ctx *ctx, cman_handle_t ch, int me);
+void qd_destroy(qd_ctx *ctx);
+
+/* proc.c */
+/* NOTE(review): implementations are not visible here; presumably these
+ * locate and check a quorum device by label -- confirm in proc.c. */
+int find_partitions(const char *partfile, const char *label,
+		    char *devname, size_t devlen, int print);
+int check_device(char *device, char *label, quorum_header_t *qh);
+
+
+#endif
/cvs/cluster/cluster/cman/qdisk/disk_util.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/disk_util.c
+++ -	2006-07-21 18:01:39.967981000 +0000
@@ -0,0 +1,293 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+  @file Misc. Quorum daemon context utilities / high-level functions
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+
+
+/**
+  Store (end - start) in dest, keeping tv_usec within 0..999999.
+  dest may alias start or end.
+ */
+static inline void
+_diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end)
+{
+	time_t secs = end->tv_sec - start->tv_sec;
+	suseconds_t usecs = end->tv_usec - start->tv_usec;
+
+	/* Borrow one second when the microsecond part went negative. */
+	if (usecs < 0) {
+		usecs += 1000000;
+		secs--;
+	}
+
+	dest->tv_sec = secs;
+	dest->tv_usec = usecs;
+}
+
+
+/**
+  Record a new write time and recalculate the average write time.
+
+  Samples are kept in the ring ctx->qc_last.  The sample is stored in
+  slot (qc_writes % HISTORY_LENGTH) and only then is qc_writes
+  incremented, so slots 0..min(qc_writes, HISTORY_LENGTH)-1 are always
+  the populated ones and (qc_writes - 1) % HISTORY_LENGTH is the most
+  recent sample.
+
+  @param ctx		Quorum disk context to update.
+  @param newtime	Duration of the write that just completed.
+ */
+void
+qd_update_wtime(qd_ctx *ctx, struct timeval *newtime)
+{
+	int x;
+	int max = HISTORY_LENGTH;
+	uint64_t sum = 0;
+
+	/* Store the thing */
+	x = ctx->qc_writes % HISTORY_LENGTH;
+	ctx->qc_last[x].tv_sec = newtime->tv_sec;
+	ctx->qc_last[x].tv_usec = newtime->tv_usec;
+	ctx->qc_writes++;
+
+	if (ctx->qc_writes < HISTORY_LENGTH)
+		max = ctx->qc_writes;
+
+	for (x = 0; x < max; x++) {
+		/* Widen before multiplying; tv_sec may be 32 bits wide. */
+		sum += (uint64_t)ctx->qc_last[x].tv_sec * 1000000;
+		sum += ctx->qc_last[x].tv_usec;
+	}
+
+	sum /= max;
+
+	ctx->qc_average.tv_sec = (sum / 1000000);
+	ctx->qc_average.tv_usec = (sum % 1000000);
+}
+
+
+/**
+  Build the status block for node nid from the given state, message and
+  masks plus the timing data in ctx, write it to that node's slot on the
+  quorum disk, and record how long the write took.
+
+  msg, mask and master may each be NULL, in which case the corresponding
+  fields are written as zeroes.
+
+  Returns 0 on success; -1 with errno set to EINVAL for a NULL context or
+  a non-positive node ID; -1 if the disk write fails.
+ */
+int
+qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
+		disk_msg_t *msg, memb_mask_t mask, memb_mask_t master)
+{
+	status_block_t ps;
+	struct timeval t0, t1, elapsed;
+	const struct timeval *prev;
+	int timed;
+
+	if (!ctx || nid <= 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Everything not assigned below stays zero. */
+	memset(&ps, 0, sizeof(ps));
+
+	ps.ps_magic = STATE_MAGIC_NUMBER;
+	ps.ps_updatenode = ctx->qc_my_id;
+	ps.ps_timestamp = (uint64_t)time(NULL);
+	ps.ps_nodeid = nid;
+	ps.ps_state = (uint8_t)state;
+	ps.ps_ca_sec = ctx->qc_average.tv_sec;
+	ps.ps_ca_usec = ctx->qc_average.tv_usec;
+	ps.ps_incarnation = ctx->qc_incarnation;
+
+	if (mask)
+		memcpy(ps.ps_mask, mask, sizeof(memb_mask_t));
+	if (master)
+		memcpy(ps.ps_master_mask, master, sizeof(memb_mask_t));
+
+	/* Last cycle time, once at least one write has been recorded */
+	if (ctx->qc_writes) {
+		prev = &ctx->qc_last[(ctx->qc_writes - 1) % HISTORY_LENGTH];
+		ps.ps_lc_sec = prev->tv_sec;
+		ps.ps_lc_usec = prev->tv_usec;
+	}
+
+	if (msg) {
+		ps.ps_msg = msg->m_msg;
+		ps.ps_seq = msg->m_seq;
+		ps.ps_arg = msg->m_arg;
+	}
+
+	/* Time the write; fall back to the average if the clock fails. */
+	timed = (gettimeofday(&t0, NULL) >= 0);
+
+	swab_status_block_t(&ps);
+	if (qdisk_write(ctx->qc_fd, qdisk_nodeid_offset(nid), &ps,
+			sizeof(ps)) < 0) {
+		printf("Error writing node ID block %d\n", nid);
+		return -1;
+	}
+
+	if (timed && (gettimeofday(&t1, NULL) < 0))
+		timed = 0;
+
+	if (timed) {
+		_diff_tv(&elapsed, &t0, &t1);
+	} else {
+		/* Use heuristic */
+		elapsed.tv_sec = ctx->qc_average.tv_sec;
+		elapsed.tv_usec = ctx->qc_average.tv_usec;
+	}
+	qd_update_wtime(ctx, &elapsed);
+
+	return 0;
+}
+
+
+/**
+  Print the contents of a (host byte order) status block to stdout.
+
+  @param ps	Status block to print.
+  @return	Always 0.
+ */
+int
+qd_print_status(status_block_t *ps)
+{
+	int x;
+
+	printf("Data @ offset %d:\n",
+	       (int)qdisk_nodeid_offset(ps->ps_nodeid));
+	printf("status_block_t {\n");
+	printf("\t.ps_magic = %08x;\n", (int)ps->ps_magic);
+	printf("\t.ps_nodeid = %d;\n", (int)ps->ps_nodeid);
+	printf("\t.ps_updatenode = %d;\n", (int)ps->ps_updatenode);
+	printf("\t.pad0 = %d;\n", (int)ps->pad0);
+	printf("\t.ps_timestamp = %llu;\n", (long long unsigned)
+		ps->ps_timestamp);
+	printf("\t.ps_state = %d;\n", ps->ps_state);
+	printf("\t.pad1[0] = %d;\n", ps->pad1[0]);
+	printf("\t.ps_flags = %d;\n", ps->ps_flags);
+	printf("\t.ps_score = %d;\n", ps->ps_score);
+	printf("\t.ps_scoremax = %d;\n", ps->ps_scoremax);
+	printf("\t.ps_ca_sec = %d;\n", ps->ps_ca_sec);
+	printf("\t.ps_ca_usec = %d;\n", ps->ps_ca_usec);
+	printf("\t.ps_lc_sec = %d;\n", ps->ps_lc_sec);
+	printf("\t.ps_lc_usec = %d;\n", ps->ps_lc_usec);
+
+	/* Masks are printed highest byte first. */
+	printf("\t.ps_mask = 0x");
+	for (x = (sizeof(memb_mask_t)-1); x >= 0; x--)
+		printf("%02x", ps->ps_mask[x]);
+	printf("\n");
+	printf("\t.ps_master_mask = 0x");
+	for (x = (sizeof(memb_mask_t)-1); x >= 0; x--)
+		printf("%02x", ps->ps_master_mask[x]);
+	printf("\n");
+
+	printf("}\n");
+
+	return 0;
+}
+
+
+/**
+  Read the status block of node nid from the quorum disk and print it.
+
+  Returns 0 on success; -1 with errno set to EINVAL for a negative
+  descriptor or non-positive node ID; -1 if the read fails.
+ */
+int
+qd_read_print_status(int fd, int nid)
+{
+	status_block_t blk;
+
+	if (fd < 0 || nid <= 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (qdisk_read(fd, qdisk_nodeid_offset(nid), &blk,
+			sizeof(blk)) >= 0) {
+		/* Convert from disk to host byte order before printing */
+		swab_status_block_t(&blk);
+		qd_print_status(&blk);
+		return 0;
+	}
+
+	printf("Error reading node ID block %d\n", nid);
+	return -1;
+}
+
+
+/**
+  Generate a token based on the current system time.
+
+  The upper 32 bits hold the seconds and the lower 32 bits the
+  microseconds reported by gettimeofday(), so two tokens taken within the
+  same second still differ.  The result is never 0.
+
+  @return	Non-zero 64-bit token.
+ */
+uint64_t
+generate_token(void)
+{
+	uint64_t my_token = 0;
+	struct timeval tv;
+
+	while (my_token == 0) {
+		gettimeofday(&tv, NULL);
+
+		my_token = ((uint64_t) (tv.tv_sec) << 32) |
+			((uint64_t) (tv.tv_usec) & 0x00000000ffffffffULL);
+	}
+
+	return my_token;
+}
+
+
+/**
+  Initialize a quorum disk context, given a CMAN handle and a nodeid.
+
+  The context is zeroed, given a fresh incarnation token, and its
+  descriptor is marked as not open.
+
+  @param ctx	Context to initialize.
+  @param ch	CMAN handle to store; must not be NULL/0.
+  @param me	Our node ID; must not be 0.
+  @return	0 on success, -1 with errno = EINVAL on a bad argument.
+ */
+int
+qd_init(qd_ctx *ctx, cman_handle_t ch, int me)
+{
+	if (!ctx || !ch || !me) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	memset(ctx, 0, sizeof(*ctx));
+	/* 0 is a valid descriptor (stdin); use -1 for "not open" so that
+	 * tearing down an unopened context cannot close it. */
+	ctx->qc_fd = -1;
+	ctx->qc_incarnation = generate_token();
+	ctx->qc_ch = ch;
+	ctx->qc_my_id = me;
+
+	return 0;
+}
+
+
+/**
+  Destroy a quorum disk context: free the device name and close the
+  descriptor.  A NULL context, or one whose node ID is 0 (never
+  initialized), is left alone.
+
+  @param ctx	Context to tear down; may be NULL.
+ */
+void
+qd_destroy(qd_ctx *ctx)
+{
+	if (!ctx || ctx->qc_my_id == 0)
+		return;
+	if (ctx->qc_device) {
+		free(ctx->qc_device);
+		ctx->qc_device = NULL;
+	}
+	/* Only close something that can be a descriptor. */
+	if (ctx->qc_fd >= 0)
+		close(ctx->qc_fd);
+	ctx->qc_fd = -1;
+}
/cvs/cluster/cluster/cman/qdisk/gettid.c,v  -->  standard output
revision 1.4.2.1
--- cluster/cman/qdisk/gettid.c
+++ -	2006-07-21 18:01:40.062745000 +0000
@@ -0,0 +1,24 @@
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/unistd.h>
+#include <gettid.h>
+#include <errno.h>
+#include <unistd.h>
+
+/* Patch from Adam Conrad / Ubuntu: Don't use _syscall macro */
+
+/*
+ * gettid(): return the caller's thread ID.  Uses the gettid system call
+ * when the platform defines __NR_gettid; otherwise falls back to
+ * pthread_self() and says so at compile time.
+ */
+#ifdef __NR_gettid
+pid_t gettid (void)
+{
+	return syscall(__NR_gettid);
+}
+#else
+
+#warning "gettid not available -- substituting with pthread_self()"
+
+#include <pthread.h>
+pid_t gettid (void)
+{
+	return (pid_t)pthread_self();
+}
+#endif
/cvs/cluster/cluster/cman/qdisk/gettid.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/gettid.h
+++ -	2006-07-21 18:01:40.159682000 +0000
@@ -0,0 +1,7 @@
+#ifndef __GETTID_H
+#define __GETTID_H
+
+/* pid_t comes from here; include it so this header stands on its own. */
+#include <sys/types.h>
+
+/* Return the calling thread's ID. */
+pid_t gettid(void);
+
+#endif
+
/cvs/cluster/cluster/cman/qdisk/main.c,v  -->  standard output
revision 1.3.2.1
--- cluster/cman/qdisk/main.c
+++ -	2006-07-21 18:01:40.252627000 +0000
@@ -0,0 +1,1026 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+  @file Main loop / functions for disk-based quorum daemon.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/reboot.h>
+#include <linux/reboot.h>
+#include <signal.h>
+#include <ccs.h>
+#include "score.h"
+#include "clulog.h"
+/*
+  TODO:
+  1) Take into account timings to gracefully extend node timeouts during 
+     node spikes (that's why they are there!).
+  2) Poll ccsd for configuration changes.
+  3) Logging.
+ */
+
+/*
+ * From bitmap.c
+ * Each takes the mask, a zero-based bit index and the mask length;
+ * callers in this file pass the length in bytes (sizeof(memb_mask_t)).
+ */
+int clear_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+int set_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+int is_bit_set(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+/* Cleared by int_handler(); it is written from a signal handler, so it
+ * must be a volatile sig_atomic_t. */
+static volatile sig_atomic_t _running = 0;
+
+
+/**
+  Signal handler: clear the _running flag.  The signal number is ignored.
+ */
+static void
+int_handler(int sig)
+{
+	(void)sig;
+	_running = 0;
+}
+
+
+/**
+  Tell whether a state counts as "running": returns the state itself
+  (non-zero) for S_INIT and above, 0 for anything below.
+ */
+inline int
+state_run(disk_node_state_t state)
+{
+	if (state < S_INIT)
+		return 0;
+	return state;
+}
+
+
+/**
+  Reset an array of max node info blocks: everything is zeroed, entry i
+  gets node ID i + 1, and both the status timestamp and the last-seen
+  time are set to the current time.
+ */
+void
+node_info_init(node_info_t *ni, int max)
+{
+	const time_t now = time(NULL);
+	node_info_t *cur;
+	int id;
+
+	memset(ni, 0, sizeof(*ni) * max);
+
+	/* node ids are 1-based */
+	for (id = 1; id <= max; id++) {
+		cur = &ni[id - 1];
+		cur->ni_status.ps_nodeid = id;
+		cur->ni_status.ps_timestamp = now;
+		cur->ni_last_seen = now;
+		/* ni_misses is already 0 from the memset above */
+	}
+}
+
+
+/**
+  Check to see if someone tried to evict us but we were out to lunch.
+  Rare case; usually other nodes would put up the 'Undead' message and
+  re-evict us.
+
+  Nothing happens when our block was last written by nobody (0) or by
+  ourselves.  Otherwise, an S_EVICT state triggers a reboot; if that call
+  returns, or for any other state, an emergency message is logged and the
+  process stops itself with SIGSTOP.
+ */
+void
+check_self(qd_ctx *ctx, status_block_t *sb)
+{
+	int foreign_writer;
+
+	foreign_writer = sb->ps_updatenode &&
+			 (sb->ps_updatenode != ctx->qc_my_id);
+	if (!foreign_writer)
+		return;
+
+	/* I did not update this??! */
+	if (sb->ps_state == S_EVICT) {
+		/* Someone told us to die. */
+		reboot(RB_AUTOBOOT);
+		/* Still here: drop through to the stop below. */
+	}
+
+	clulog(LOG_EMERG, "Unhandled state: %d\n", sb->ps_state);
+	raise(SIGSTOP);
+}
+
+
+/**
+  Read in the node blocks off of the quorum disk and see if anyone has
+  or has not updated their timestamp recently.  See check_transitions as
+  well.
+
+  For our own block only check_self() is run.  For every other node the
+  message fields are copied into ni_msg; then, if the node reports a
+  running state, an unchanged timestamp counts as a miss and a changed
+  one resets the miss counter and counts as "seen".
+
+  @param ctx	Quorum disk context (descriptor and our node ID).
+  @param ni	Array of max node info blocks; entry x describes node x+1.
+  @param max	Number of entries in ni.
+ */
+void
+read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
+{
+	int x;
+	status_block_t *sb;
+
+	for (x = 0; x < max; x++) {
+
+		sb = &ni[x].ni_status;
+
+		if (qdisk_read(ctx->qc_fd, qdisk_nodeid_offset(x+1),
+			       sb, sizeof(*sb)) < 0) {
+			clulog(LOG_WARNING,"Error reading node ID block %d\n",
+			       x+1);
+			/*
+			 * A failed read leaves *sb as it was, i.e. the
+			 * previous copy already in host byte order.  Do
+			 * not swap it again; the stale timestamp is then
+			 * counted as a miss below.
+			 */
+		} else {
+			swab_status_block_t(sb);
+		}
+
+		if (sb->ps_nodeid == ctx->qc_my_id) {
+			check_self(ctx, sb);
+			continue;
+		}
+		/* message. */
+		ni[x].ni_msg.m_arg = sb->ps_arg;
+		ni[x].ni_msg.m_msg = sb->ps_msg;
+		ni[x].ni_msg.m_seq = sb->ps_seq;
+
+		if (!state_run(sb->ps_state))
+			continue;
+
+		/* Unchanged timestamp: miss */
+		if (sb->ps_timestamp == ni[x].ni_last_seen) {
+			/* XXX check for average + allow grace */
+			ni[x].ni_misses++;
+			continue;
+		}
+
+		/* Got through?  The node is good. */
+		ni[x].ni_misses = 0;
+		ni[x].ni_seen++;
+		ni[x].ni_last_seen = sb->ps_timestamp;
+	}
+}
+
+
+/**
+  Check for node transitions.
+
+  Walks all max entries of ni and compares our recorded view of each node
+  (ni_state, ni_incarnation, ni_misses, ni_seen) with the status block
+  last read for it (ni_status).  The cases below are tried in order and
+  the first one that matches ends the handling of that node.
+
+  If mask is not NULL it is cleared first, and the bit (node ID - 1) is
+  set for every node that ends up considered running.
+
+  Eviction notices for timed-out nodes are written only when
+  ctx->qc_status is S_MASTER.
+ */
+void
+check_transitions(qd_ctx *ctx, node_info_t *ni, int max, memb_mask_t mask)
+{
+	int x;
+
+	if (mask)
+		memset(mask, 0, sizeof(memb_mask_t));
+
+	for (x = 0; x < max; x++) {
+
+		/*
+		   Case 1: check to see if the node is still up
+		   according to our internal state, but has been
+		   evicted by the master or cleanly shut down
+		   (or restarted).
+
+		   Transition from Evicted/Shutdown -> Offline
+		 */
+		if ((ni[x].ni_state >= S_EVICT &&
+		     ni[x].ni_status.ps_state <= S_EVICT) ||
+		     (ni[x].ni_incarnation &&
+		      (ni[x].ni_incarnation !=
+		       ni[x].ni_status.ps_incarnation))) {
+
+			if (ni[x].ni_status.ps_state == S_EVICT) {
+				clulog(LOG_NOTICE, "Node %d evicted\n",
+				       ni[x].ni_status.ps_nodeid);
+			} else {
+				/* State == S_NONE or incarnation change */
+				clulog(LOG_INFO, "Node %d shutdown\n",
+				       ni[x].ni_status.ps_nodeid);
+				ni[x].ni_evil_incarnation = 0;
+			}
+
+			/* Forget everything we knew about this node */
+			ni[x].ni_incarnation = 0;
+			ni[x].ni_seen = 0;
+			ni[x].ni_misses = 0;
+			ni[x].ni_state = S_NONE;
+
+			continue;
+		}
+
+		/*
+		   Case 2: Check for a heartbeat timeout.  Write an eviction
+		   notice if we're the master.  If this is our first notice
+		   of the heartbeat timeout, update our internal state
+		   accordingly.  When the master evicts this node, we will
+		   hit case 1 above.
+
+		   Transition from Online -> Evicted
+		 */
+		if (ni[x].ni_misses > ctx->qc_tko &&
+		     state_run(ni[x].ni_status.ps_state)) {
+
+			/*
+			   Write eviction notice if we're the master.
+			 */
+			if (ctx->qc_status == S_MASTER) {
+				clulog(LOG_DEBUG,
+				       "Writing eviction notice for node %d\n",
+				       ni[x].ni_status.ps_nodeid);
+				qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+						S_EVICT, NULL, NULL, NULL);
+				clulog(LOG_DEBUG,
+				       "Telling CMAN to kill the node\n");
+				cman_kill_node(ctx->qc_ch,
+					       ni[x].ni_status.ps_nodeid);
+			}
+
+			/*
+			   Mark our internal views as dead if nodes miss too
+			   many heartbeats...  This will cause a master
+			   transition if no live master exists.
+			 */
+			if (ni[x].ni_status.ps_state >= S_RUN &&
+			    ni[x].ni_seen) {
+				clulog(LOG_DEBUG, "Node %d DOWN\n",
+				       ni[x].ni_status.ps_nodeid);
+				ni[x].ni_seen = 0;	
+			}
+
+			/* Remember the incarnation that timed out so that
+			   case 3 can recognize it if it comes back */
+			ni[x].ni_state = S_EVICT;
+			ni[x].ni_status.ps_state = S_EVICT;
+			ni[x].ni_evil_incarnation = 
+				ni[x].ni_status.ps_incarnation;
+			
+			continue;
+		}
+
+		/*
+		   Case 3:  Check for node who is supposed to be dead, but
+		   has started writing to the disk again with the same
+		   incarnation.  
+
+		   Transition from Offline -> Undead (BAD!!!)
+		 */
+		if (ni[x].ni_evil_incarnation &&
+                    (ni[x].ni_evil_incarnation == 
+		     ni[x].ni_status.ps_incarnation)) {
+			clulog(LOG_CRIT, "Node %d is undead.\n",
+			       ni[x].ni_status.ps_nodeid);
+
+			/* Unlike case 2, this is not limited to the master */
+			clulog(LOG_ALERT,
+			       "Writing eviction notice for node %d\n",
+			       ni[x].ni_status.ps_nodeid);
+			qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+					S_EVICT, NULL, NULL, NULL);
+			ni[x].ni_status.ps_state = S_EVICT;
+
+			/* XXX Need to fence it again */
+			clulog(LOG_DEBUG, "Telling CMAN to kill the node\n");
+			cman_kill_node(ctx->qc_ch,
+				       ni[x].ni_status.ps_nodeid);
+			continue;
+		}
+
+
+		/*
+		   Case 4:  Check for a node who has met our minimum # of
+		   'seen' requests.
+
+		   Transition from Offline -> Online
+		 */
+		if (ni[x].ni_seen > (ctx->qc_tko / 2) &&
+		    !state_run(ni[x].ni_state)) {
+			/*
+			   Node-join - everyone just kind of "agrees"
+			   there's no consensus to just have a node join
+			   right now.
+			 */
+			ni[x].ni_state = S_RUN;
+			clulog(LOG_DEBUG, "Node %d is UP\n",
+			       ni[x].ni_status.ps_nodeid);
+			ni[x].ni_incarnation =
+			    ni[x].ni_status.ps_incarnation;
+			if (mask)
+				set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					sizeof(memb_mask_t));
+
+			continue;
+		}
+
+		/*
+		   Case 5: Check for a node becoming master.  Not really a
+		   transition.
+		 */
+		if (ni[x].ni_state == S_RUN &&
+		    ni[x].ni_status.ps_state == S_MASTER) {
+			clulog(LOG_INFO, "Node %d is the master\n",
+			       ni[x].ni_status.ps_nodeid);
+			ni[x].ni_state = S_MASTER;
+			if (mask)
+				set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					sizeof(memb_mask_t));
+			continue;
+		}
+
+		/*
+		   All other cases: Believe the node's reported state ;)
+		 */
+		if (state_run(ni[x].ni_state)) {
+			ni[x].ni_state = ni[x].ni_status.ps_state;
+			if (mask)
+				set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+					sizeof(memb_mask_t));
+		}
+	}
+}
+
+
+/**
+  Checks for presence of an online master.
+
+  On return *low_id holds the lowest node ID among our own ID and all
+  nodes we consider running (ni_state >= S_RUN), other than a block that
+  carries our own ID and reports S_MASTER.
+
+  Returns the node ID of the first master found, or 0 if there is none.
+  A critical message is logged when more than one master is counted.
+ */
+int
+master_exists(qd_ctx *ctx, node_info_t *ni, int max, int *low_id)
+{
+	node_info_t *node;
+	int idx;
+	int found = 0;		/* number of masters counted */
+	int master_id = 0;	/* first master seen, 0 if none */
+	int alive, says_master;
+
+	*low_id = ctx->qc_my_id;
+
+	for (idx = 0; idx < max; idx++) {
+		node = &ni[idx];
+		alive = (node->ni_state >= S_RUN);
+		says_master = (node->ni_status.ps_state == S_MASTER);
+
+		/* See if this one's a master */
+		if (alive && says_master) {
+			if (!master_id)
+				master_id = node->ni_status.ps_nodeid;
+			++found;
+		}
+
+		/* See if it's us... */
+		if (node->ni_status.ps_nodeid == ctx->qc_my_id &&
+		    says_master) {
+			if (!master_id)
+				master_id = ctx->qc_my_id;
+			++found;
+			continue;
+		}
+
+		/* Nodes that are not running do not take part in the
+		   lowest-ID search; a dead one still marked master is
+		   worth a debug message. */
+		if (!alive) {
+			if (says_master) {
+				clulog(LOG_DEBUG,
+				       "Node %d is marked master, but is dead.\n",
+				       node->ni_status.ps_nodeid);
+			}
+			continue;
+		}
+
+		if (node->ni_status.ps_nodeid < *low_id)
+			*low_id = node->ni_status.ps_nodeid;
+	}
+
+	if (found > 1) {
+		clulog(LOG_CRIT,
+		       "Critical Error: More than one master found!\n");
+		/* XXX Handle this how? */
+	}
+
+	return master_id;
+}
+
+
+/**
+  Initialize node information blocks and wait to see if there is already
+  a cluster running using this QD.  Note that this will delay master
+  election if multiple nodes start within a second or two of each other.
+
+  Validates and opens ctx->qc_device, starts the score thread, writes our
+  own status block as S_INIT, and then runs qc_tko read/check/write
+  cycles, sleeping qc_interval between them.
+
+  Returns 0 on success, -1 on failure.
+ */
+int
+quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh)
+{
+	int cycle, score, maxscore;
+
+	clulog(LOG_INFO, "Quorum Daemon Initializing\n");
+
+	if (qdisk_validate(ctx->qc_device) < 0)
+		return -1;
+
+	ctx->qc_fd = qdisk_open(ctx->qc_device);
+	if (ctx->qc_fd < 0) {
+		clulog(LOG_CRIT, "Failed to open %s: %s\n", ctx->qc_device,
+		       strerror(errno));
+		return -1;
+	}
+
+	start_score_thread(h, maxh);
+
+	/* Announce ourselves as initializing before watching the others */
+	node_info_init(ni, max);
+	if (qd_write_status(ctx, ctx->qc_my_id,
+			    S_INIT, NULL, NULL, NULL) != 0) {
+		clulog(LOG_CRIT, "Could not initialize status block!\n");
+		return -1;
+	}
+
+	for (cycle = 1; cycle <= ctx->qc_tko; cycle++) {
+		read_node_blocks(ctx, ni, max);
+		check_transitions(ctx, ni, max, NULL);
+
+		if (qd_write_status(ctx, ctx->qc_my_id,
+				    S_INIT, NULL, NULL, NULL) != 0) {
+			clulog(LOG_CRIT, "Initialization failed\n");
+			return -1;
+		}
+
+		sleep(ctx->qc_interval);
+	}
+
+	get_my_score(&score,&maxscore);
+	clulog(LOG_INFO, "Initial score %d/%d\n", score, maxscore);
+	clulog(LOG_INFO, "Initialization complete\n");
+
+	return 0;
+}
+
+
+/**
+  Vote for a master if it puts a bid in.
+
+  Scans the nodes in array order and, for the first one that we consider
+  S_RUN, that has M_BID posted and whose node ID is lower than ours,
+  fills *msg with an M_ACK naming that node and echoing its sequence
+  number.  *msg is left untouched if there is no such node.
+ */
+void
+do_vote(qd_ctx *ctx, node_info_t *ni, int max, disk_msg_t *msg)
+{
+	status_block_t *st;
+	int idx;
+
+	for (idx = 0; idx < max; idx++) {
+		st = &ni[idx].ni_status;
+
+		/* Vote for lowest bidding ID that is lower than us */
+		if (ni[idx].ni_state == S_RUN &&
+		    st->ps_msg == M_BID &&
+		    st->ps_nodeid < ctx->qc_my_id) {
+			msg->m_msg = M_ACK;
+			msg->m_arg = st->ps_nodeid;
+			msg->m_seq = st->ps_seq;
+			return;
+		}
+	}
+}
+
+
+/*
+  Check to match nodes in mask with nodes online according to CMAN.
+  Only the master needs to do this.
+
+  master_mask is cleared and then gets a bit set for every node that is
+  both set in mask and reported as a member by CMAN.  If the node list
+  cannot be obtained, master_mask is left untouched.
+
+  mask and master_mask are array parameters and therefore plain pointers
+  inside this function: sizes must be taken from the memb_mask_t type,
+  not from the parameters.
+ */
+void
+check_cman(qd_ctx *ctx, memb_mask_t mask, memb_mask_t master_mask)
+{
+	cman_node_t nodes[MAX_NODES_DISK];
+	int retnodes, x;
+
+	if (cman_get_nodes(ctx->qc_ch, MAX_NODES_DISK,
+			   &retnodes, nodes) <0 )
+		return;
+
+	memset(master_mask, 0, sizeof(memb_mask_t));
+
+	for (x = 0; x < retnodes; x++) {
+		/* Bit indexes are (node ID - 1); skip IDs below 1 */
+		if (nodes[x].cn_nodeid < 1)
+			continue;
+
+		if (is_bit_set(mask, nodes[x].cn_nodeid-1,
+			       sizeof(memb_mask_t)) &&
+		    nodes[x].cn_member)
+			set_bit(master_mask, nodes[x].cn_nodeid-1,
+				sizeof(memb_mask_t));
+	}
+}
+
+
+/*
+   Tally the responses to our pending master bid.
+
+   returns:
+	3: every running node has acked us - we are the master.
+	2: at least one node nacked us.
+	1: a node with a lower ID is also bidding; we should rescind our
+	   bid.
+	0: still waiting; keep the bid and check again next round.
+   Modifies:
+	*msg - when a lower-ID bidder is seen, it is rewritten into an
+	ACK for the lowest such bidder.
+ */
+int
+check_votes(qd_ctx *ctx, node_info_t *ni, int max, disk_msg_t *msg)
+{
+	int idx;
+	int running = 0, acks = 0, nacks = 0;
+	int low_id = ctx->qc_my_id;
+
+	for (idx = 0; idx < max; idx++) {
+		node_info_t *cur = &ni[idx];
+
+		/* Only nodes in a running state take part */
+		if (!state_run(cur->ni_state))
+			continue;
+		++running;
+
+		/* Responses addressed to us */
+		if (cur->ni_status.ps_arg == ctx->qc_my_id) {
+			if (cur->ni_status.ps_msg == M_ACK)
+				++acks;
+			else if (cur->ni_status.ps_msg == M_NACK)
+				++nacks;
+		}
+
+		/* A competing bid from a lower ID than the lowest seen
+		   so far: switch our message to a vote for it */
+		if (cur->ni_status.ps_msg == M_BID &&
+		    cur->ni_status.ps_nodeid < low_id) {
+			low_id = cur->ni_status.ps_nodeid;
+			msg->m_msg = M_ACK;
+			msg->m_arg = cur->ni_status.ps_nodeid;
+			msg->m_seq = cur->ni_status.ps_seq;
+		}
+	}
+
+	if (acks == running)
+		return 3;
+	if (nacks != 0)
+		return 2;
+	return (low_id != ctx->qc_my_id) ? 1 : 0;
+}
+
+
+/**
+  Map a node state to a printable name.  Values without a name of
+  their own yield "ILLEGAL".  The returned string is a literal and
+  must not be modified or freed.
+ */
+char *
+state_str(disk_node_state_t s)
+{
+	if (s == S_NONE)
+		return "None";
+	if (s == S_EVICT)
+		return "Evicted";
+	if (s == S_INIT)
+		return "Initializing";
+	if (s == S_RUN)
+		return "Running";
+	if (s == S_MASTER)
+		return "Master";
+
+	return "ILLEGAL";
+}
+
+
+/**
+  Write a human-readable snapshot of our state to ctx->qc_status_file.
+  Does nothing when no status file is configured; the name "-" means
+  stdout.  A file that cannot be opened is skipped silently.
+ */
+void
+update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
+		    int score_req, int score_max)
+{
+	FILE *out;
+	int idx, to_stdout;
+
+	if (ctx->qc_status_file == NULL)
+		return;
+
+	to_stdout = (strcmp(ctx->qc_status_file, "-") == 0);
+	if (to_stdout) {
+		out = stdout;
+	} else {
+		out = fopen(ctx->qc_status_file, "w+");
+		if (out == NULL)
+			return;
+	}
+
+	fprintf(out, "Node ID: %d\n", ctx->qc_my_id);
+	fprintf(out, "Score (current / min req. / max allowed): %d / %d / %d\n",
+		score, score_req, score_max);
+	fprintf(out, "Current state: %s\n", state_str(ctx->qc_status));
+	fprintf(out, "Current disk state: %s\n",
+		state_str(ctx->qc_disk_status));
+
+	/* Nodes at S_RUN or above, plus ourselves */
+	fprintf(out, "Visible Set: {");
+	for (idx = 0; idx < max; idx++) {
+		if (ni[idx].ni_state < S_RUN &&
+		    ni[idx].ni_status.ps_nodeid != ctx->qc_my_id)
+			continue;
+		fprintf(out, " %d", ni[idx].ni_status.ps_nodeid);
+	}
+	fprintf(out, " }\n");
+
+	if (ctx->qc_master) {
+		/* Nodes whose bit is set in the master's mask */
+		fprintf(out, "Master Node ID: %d\n", ctx->qc_master);
+		fprintf(out, "Quorate Set: {");
+		for (idx = 0; idx < max; idx++) {
+			if (!is_bit_set(
+			      ni[ctx->qc_master-1].ni_status.ps_master_mask,
+			      ni[idx].ni_status.ps_nodeid-1,
+			      sizeof(memb_mask_t)))
+				continue;
+			fprintf(out, " %d", ni[idx].ni_status.ps_nodeid);
+		}
+		fprintf(out, " }\n");
+	} else {
+		fprintf(out, "No master node\n");
+	}
+
+	fprintf(out, "\n");
+	if (!to_stdout)
+		fclose(out);
+}
+
+
+
+/**
+  Main loop of the quorum daemon; runs until _running is cleared.
+
+  Each pass reads every node's block, checks for transitions, compares
+  our heuristic score with the required minimum, takes part in master
+  election (bid / vote / tally), polls the CMAN quorum device when
+  appropriate, and finally writes our own status block and the local
+  status file before sleeping qc_interval.
+
+  Always returns 0.
+ */
+int
+quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
+{
+	disk_msg_t msg = {0, 0, 0};
+	int low_id, bid_pending = 0, score, score_max, score_req;
+	memb_mask_t mask, master_mask;
+
+	/* Start with empty masks.  master_mask is only filled in by
+	   check_cman() while we are master, yet it is handed to
+	   qd_write_status() on every pass; without this, whatever was
+	   on the stack would be written out. */
+	memset(mask, 0, sizeof(mask));
+	memset(master_mask, 0, sizeof(master_mask));
+
+	ctx->qc_status = S_RUN;
+
+	_running = 1;
+	while (_running) {
+		/* Read everyone else's status */
+		read_node_blocks(ctx, ni, max);
+
+		/* Check for node transitions */
+		check_transitions(ctx, ni, max, mask);
+
+		/* Check heuristics and remove ourself if necessary */
+		get_my_score(&score, &score_max);
+
+		/* No configured minimum: require half the maximum,
+		   rounded up */
+		score_req = ctx->qc_scoremin;
+		if (score_req <= 0)
+			score_req = ((score_max + 1) / 2);
+
+		if (score < score_req) {
+			clear_bit(mask, (ctx->qc_my_id-1), sizeof(mask));
+			if (ctx->qc_status > S_NONE) {
+				clulog(LOG_NOTICE,
+				       "Score insufficient for master "
+				       "operation (%d/%d; max=%d); "
+				       "downgrading\n",
+				       score, score_req, score_max);
+				ctx->qc_status = S_NONE;
+				msg.m_msg = M_NONE;
+				++msg.m_seq;
+				bid_pending = 0;
+				cman_poll_quorum_device(ctx->qc_ch, 0);
+				/* reboot??? */
+			}
+		}  else {
+			set_bit(mask, (ctx->qc_my_id-1), sizeof(mask));
+			if (ctx->qc_status == S_NONE) {
+				clulog(LOG_NOTICE,
+				       "Score sufficient for master "
+				       "operation (%d/%d; max=%d); "
+				       "upgrading\n",
+				       score, score_req, score_max);
+				ctx->qc_status = S_RUN;
+			}
+		}
+
+		/* Find master */
+		ctx->qc_master = master_exists(ctx, ni, max, &low_id);
+
+		/* Figure out what to do based on what we know */
+		if (!ctx->qc_master &&
+		    low_id == ctx->qc_my_id &&
+		    ctx->qc_status == S_RUN &&
+		    !bid_pending ) {
+			/*
+			   If there's no master, and we are the lowest node
+			   ID, make a bid to become master if we're not 
+			   already bidding.
+			 */
+
+			clulog(LOG_DEBUG,"Making bid for master\n");
+			msg.m_msg = M_BID;
+			++msg.m_seq;
+			bid_pending = 1;
+
+		} else if (!ctx->qc_master && !bid_pending) {
+
+			/* We're not the master, and we do not have a bid
+			   pending.  Check for voting on other nodes. */
+			do_vote(ctx, ni, max, &msg);
+		} else if (!ctx->qc_master && bid_pending) {
+
+			/* We're currently bidding for master.
+			   See if anyone's voted, or if we should
+			   rescind our bid */
+
+			/* Yes, those are all deliberate fallthroughs:
+			   3 (won) also clears the message and the bid,
+			   2 (nacked) also clears the bid,
+			   1 (lower bidder) only clears the bid. */
+			switch (check_votes(ctx, ni, max, &msg)) {
+			case 3:
+				clulog(LOG_INFO,
+				       "Assuming master role\n");
+				ctx->qc_status = S_MASTER;
+			case 2:
+				msg.m_msg = M_NONE;
+			case 1:
+				bid_pending = 0;
+			default:
+				break;
+			}
+		} else if (ctx->qc_status == S_MASTER &&
+			   ctx->qc_master != ctx->qc_my_id) {
+			
+			/* We think we're master, but someone else claims
+			   that they are master. */
+
+			clulog(LOG_CRIT,
+			       "A master exists, but it's not me?!\n");
+			/* XXX Handle this how? Should not happen*/
+			/* reboot(RB_AUTOBOOT); */
+
+		} else if (ctx->qc_status == S_MASTER &&
+			   ctx->qc_master == ctx->qc_my_id) {
+
+			/* We are the master.  Poll the quorum device.
+			   We can't be the master unless we score high
+			   enough on our heuristics. */
+			check_cman(ctx, mask, master_mask);
+			cman_poll_quorum_device(ctx->qc_ch, 1);
+
+		} else if (ctx->qc_status == S_RUN && ctx->qc_master &&
+			   ctx->qc_master != ctx->qc_my_id) {
+
+			/* We're not the master, but a master exists
+			   Check to see if the master thinks we are 
+			   online.  If we are, tell CMAN so. */
+			if (is_bit_set(
+			      ni[ctx->qc_master-1].ni_status.ps_master_mask,
+				       ctx->qc_my_id-1,
+				       sizeof(memb_mask_t))) {
+				cman_poll_quorum_device(ctx->qc_ch, 1);
+			}
+		}
+		
+		/* Write out our status; a failed write is logged but
+		   does not stop the loop */
+		if (qd_write_status(ctx, ctx->qc_my_id, ctx->qc_status,
+				    &msg, mask, master_mask) != 0) {
+			clulog(LOG_ERR, "Error writing to quorum disk\n");
+		}
+
+		/* write out our local status */
+		update_local_status(ctx, ni, max, score, score_req, score_max);
+
+		/* Cycle. We could time the loop and sleep
+		   usleep(interval-looptime), but this is fine for now.*/
+		if (_running)
+			sleep(ctx->qc_interval);
+	}
+
+	return 0;
+}
+
+
+/**
+  Write a final status block in the S_NONE state so the other nodes
+  can see that we are leaving.  A failed write is only logged; the
+  function always returns 0.
+ */
+int
+quorum_logout(qd_ctx *ctx)
+{
+	int failed;
+
+	failed = (qd_write_status(ctx, ctx->qc_my_id, S_NONE,
+				  NULL, NULL, NULL) != 0);
+	if (failed)
+		clulog(LOG_WARNING,
+		       "Error writing to quorum disk during logout\n");
+
+	return 0;
+}
+
+
+/**
+  Load the quorum daemon settings from CCSD into ctx and the heuristic
+  table h (at most maxh entries); the value returned by
+  configure_heuristics() is stored in *cfh.
+
+  interval, tko and min_score get defaults (1, 10, 0) before the
+  lookups, and out-of-range values are clamped.  The device, label and
+  status_file strings returned by ccs_get() are kept in ctx, not freed.
+  When debug is set, the configured log level is not applied.
+
+  Returns 0 on success, -1 if the connection to CCSD fails.
+ */
+int
+get_config_data(char *cluster_name, qd_ctx *ctx, struct h_data *h, int maxh,
+		int *cfh, int debug)
+{
+	char xpath[256];
+	char *str;
+	int fd, level = 4;
+
+	clulog(LOG_DEBUG, "Loading configuration information\n");
+
+	fd = ccs_force_connect(cluster_name, 1);
+	if (fd < 0) {
+		clulog(LOG_CRIT, "Connection to CCSD failed; cannot start\n");
+		return -1;
+	}
+
+	/* Defaults for anything the configuration leaves out */
+	ctx->qc_scoremin = 0;
+	ctx->qc_tko = 10;
+	ctx->qc_interval = 1;
+
+	/* Logging facility */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@log_facility");
+	if (!ccs_get(fd, xpath, &str)) {
+		clu_set_facility(str);
+		free(str);
+	}
+
+	/* Log level; negative values fall back to 4 */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@log_level");
+	if (!ccs_get(fd, xpath, &str)) {
+		level = atoi(str);
+		free(str);
+		if (level < 0)
+			level = 4;
+
+		if (!debug)
+			clu_set_loglevel(level);
+	}
+
+	/* Interval, at least 1 */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@interval");
+	if (!ccs_get(fd, xpath, &str)) {
+		ctx->qc_interval = atoi(str);
+		free(str);
+		if (ctx->qc_interval < 1)
+			ctx->qc_interval = 1;
+	}
+
+	/* TKO count, at least 3 */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@tko");
+	if (!ccs_get(fd, xpath, &str)) {
+		ctx->qc_tko = atoi(str);
+		free(str);
+		if (ctx->qc_tko < 3)
+			ctx->qc_tko = 3;
+	}
+
+	/* Votes, never negative */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@votes");
+	if (!ccs_get(fd, xpath, &str)) {
+		ctx->qc_votes = atoi(str);
+		free(str);
+		if (ctx->qc_votes < 0)
+			ctx->qc_votes = 0;
+	}
+
+	/* Device path; ctx keeps the string */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@device");
+	if (!ccs_get(fd, xpath, &str))
+		ctx->qc_device = str;
+
+	/* Label (overrides device); ctx keeps the string */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@label");
+	if (!ccs_get(fd, xpath, &str))
+		ctx->qc_label = str;
+
+	/* Status file; ctx keeps the string */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@status_file");
+	if (!ccs_get(fd, xpath, &str))
+		ctx->qc_status_file = str;
+
+	/* Minimum score, never negative */
+	snprintf(xpath, sizeof(xpath), "/cluster/quorumd/@min_score");
+	if (!ccs_get(fd, xpath, &str)) {
+		ctx->qc_scoremin = atoi(str);
+		free(str);
+		if (ctx->qc_scoremin < 0)
+			ctx->qc_scoremin = 0;
+	}
+
+	*cfh = configure_heuristics(fd, h, maxh);
+
+	clulog(LOG_DEBUG,
+	       "Quorum Daemon: %d heuristics, %d interval, %d tko, %d votes\n",
+	       *cfh, ctx->qc_interval, ctx->qc_tko, ctx->qc_votes);
+
+	ccs_disconnect(fd);
+
+	return 0;
+}
+
+
+/**
+  Quorum daemon entry point.
+
+  Options: -d sets the log level to LOG_DEBUG, -f stays in the
+  foreground and enables console logging.
+
+  Sequence: connect to CMAN, look up the local node ID, load the
+  configuration, resolve the quorum device (by label if one is
+  configured, otherwise verify the configured device), daemonize
+  unless -f was given, initialize, register the quorum device with
+  CMAN, run the main loop, then unregister, log out and clean up.
+
+  Returns 0 after a normal shutdown, -1 on any startup failure.
+ */
+int
+main(int argc, char **argv)
+{
+	cman_node_t me;
+	int cfh, rv;
+	qd_ctx ctx;
+	cman_handle_t ch;
+	node_info_t ni[MAX_NODES_DISK];
+	struct h_data h[10];
+	char debug = 0, foreground = 0;
+	char device[128];
+
+	while ((rv = getopt(argc, argv, "fd")) != EOF) {
+		switch (rv) {
+		case 'd':
+			debug = 1;
+			break;
+		case 'f':
+			foreground = 1;
+			/* falls through to the break below */
+		default:
+			break;
+		}
+	}
+	/* Use the admin connection when built against libcman >= 2 */
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+	ch = cman_admin_init(NULL);
+#else
+	ch = cman_init(NULL);
+#endif
+	if (!ch) {
+		printf("Could not connect to cluster (CMAN not running?)\n");
+		return -1;
+	}
+
+	if (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
+		printf("Could not determine local node ID; cannot start\n");
+		return -1;
+	}
+
+	qd_init(&ctx, ch, me.cn_nodeid);
+
+	signal(SIGINT, int_handler);
+
+        if (debug)
+                clu_set_loglevel(LOG_DEBUG);
+        if (foreground)
+                clu_log_console(1);
+		
+	/* h has room for 10 heuristics; cfh receives the number loaded */
+	if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) {
+		clulog_and_print(LOG_CRIT, "Configuration failed\n");
+		return -1;
+	}
+
+	if (ctx.qc_label) {
+		/* A label takes precedence: search /proc/partitions for
+		   a device carrying it and use that device instead of
+		   any configured one */
+		if (find_partitions("/proc/partitions",
+				    ctx.qc_label, device,
+				    sizeof(device), 0) != 0) {
+			clulog_and_print(LOG_CRIT, "Unable to match label"
+					 " '%s' to any device\n",
+					 ctx.qc_label);
+			return -1;
+		}
+
+		if (ctx.qc_device)
+			free(ctx.qc_device);
+
+		ctx.qc_device = strdup(device);
+
+		clulog(LOG_INFO, "Quorum Partition: %s Label: %s\n",
+		       ctx.qc_device, ctx.qc_label);
+	} else if (ctx.qc_device) {
+		/* No label configured: the device itself must carry a
+		   valid qdisk header */
+		if (check_device(ctx.qc_device, NULL, NULL) != 0) {
+			clulog(LOG_CRIT,
+			       "Specified partition %s does not have a "
+			       "qdisk label\n", ctx.qc_device);
+			return -1;
+		}
+	}
+
+	/* Detach from the terminal; the return value is not checked */
+	if (!foreground)
+                daemon(0,0);
+
+	if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) {
+		clulog_and_print(LOG_CRIT, "Initialization failed\n");
+		return -1;
+	}
+	
+	/* Return value deliberately ignored; see the note below */
+	cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes);
+	/*
+		XXX this always returns -1 / EBUSY even when it works?!!!
+		
+	if ((rv = cman_register_quorum_device(ctx.qc_ch, ctx.qc_device,
+					      ctx.qc_votes)) < 0) {
+		clulog_and_print(LOG_CRIT,
+				 "Could not register %s with CMAN; "
+				 "return = %d; error = %s\n",
+				 ctx.qc_device, rv, strerror(errno));
+		return -1;
+	}
+	*/
+
+	/* Returns once the loop's run flag is cleared */
+	quorum_loop(&ctx, ni, MAX_NODES_DISK);
+	cman_unregister_quorum_device(ctx.qc_ch);
+
+	quorum_logout(&ctx);
+
+	qd_destroy(&ctx);
+
+	return 0;
+
+}
+
/cvs/cluster/cluster/cman/qdisk/mkqdisk.c,v  -->  standard output
revision 1.3.2.1
--- cluster/cman/qdisk/mkqdisk.c
+++ -	2006-07-21 18:01:40.340826000 +0000
@@ -0,0 +1,93 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+  @file Quorum disk utility
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <disk.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <platform.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+
+/**
+  mkqdisk entry point.
+
+  -L            list every quorum disk found via /proc/partitions
+  -f <label>    search for a quorum disk with the given label; the
+                result is reported through the exit status
+  -c <device>   device to initialize (requires -l)
+  -l <label>    label to write (requires -c)
+  -h            print usage
+
+  -L and -f act as soon as they are parsed and return the result of
+  find_partitions().  Creating a label asks for confirmation on stdin
+  first and returns the result of qdisk_init().
+ */
+int
+main(int argc, char **argv)
+{
+	char device[128];
+	char *newdev = NULL, *newlabel = NULL;
+	int rv;
+
+	printf("mkqdisk v0.5\n");
+
+	/* 'f' takes an argument (the label), hence "f:"; without the
+	   colon optarg is NULL and the search matches any label */
+	while ((rv = getopt(argc, argv, "Lf:c:l:h")) != EOF) {
+		switch (rv) {
+		case 'L':
+			/* List */
+			close(2);
+			return find_partitions("/proc/partitions",
+					       NULL, NULL, 0, 1);
+		case 'f':
+			/* Find by label */
+			close(2);
+			return find_partitions("/proc/partitions",
+					       optarg, device,
+					       sizeof(device), 0);
+		case 'c':
+			newdev = optarg;
+			break;
+		case 'l':
+			newlabel = optarg;
+			break;
+		case 'h':
+			printf("usage: mkqdisk -L | -f <label> | -c "
+			       "<device> -l <label>\n");
+			return 0;
+		default:
+			break;
+		}
+	}
+
+	if (!newdev && !newlabel) {
+		printf("usage: mkqdisk -L | -f <label> | -c "
+		       "<device> -l <label>\n");
+		return 1;
+	}
+
+	if (!newdev || !newlabel) {
+		printf("Both a device and a label are required\n");
+		return 1;
+	}
+
+	printf("Writing new quorum disk label '%s' to %s.\n",
+	       newlabel, newdev);
+	printf("WARNING: About to destroy all data on %s; proceed [N/y] ? ",
+	       newdev);
+	/* Anything but a lowercase 'y' as the first character aborts */
+	if (getc(stdin) != 'y') {
+		printf("Good thinking.\n");
+		return 0;
+	}
+
+	return qdisk_init(newdev, newlabel);
+}
/cvs/cluster/cluster/cman/qdisk/platform.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/platform.h
+++ -	2006-07-21 18:01:40.435441000 +0000
@@ -0,0 +1,59 @@
+/*
+  Copyright Red Hat, Inc. 2002-2003
+
+  The Red Hat Cluster Manager API Library is free software; you can
+  redistribute it and/or modify it under the terms of the GNU Lesser
+  General Public License as published by the Free Software Foundation;
+  either version 2.1 of the License, or (at your option) any later
+  version.
+
+  The Red Hat Cluster Manager API Library is distributed in the hope
+  that it will be useful, but WITHOUT ANY WARRANTY; without even the
+  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+  PURPOSE.  See the GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA. 
+ */
+/** @file
+ * Defines for byte-swapping
+ */
+#ifndef __PLATFORM_H
+#define __PLATFORM_H
+
+#include <endian.h>
+#include <sys/param.h>
+#include <byteswap.h>
+#include <bits/wordsize.h>
+
+/* Host order <-> little-endian: no swapping on little-endian machines */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define le_swap16(x) (x)
+#define le_swap32(x) (x)
+#define le_swap64(x) (x)
+#else
+#define le_swap16(x) bswap_16(x)
+#define le_swap32(x) bswap_32(x)
+#define le_swap64(x) bswap_64(x)
+#endif
+
+/* Host order <-> big-endian: no swapping on big-endian machines */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define be_swap16(x) bswap_16(x)
+#define be_swap32(x) bswap_32(x)
+#define be_swap64(x) bswap_64(x)
+#else
+#define be_swap16(x) (x)
+#define be_swap32(x) (x)
+#define be_swap64(x) (x)
+#endif
+
+
+/* In-place host <-> big-endian conversion.  The argument must be an
+   lvalue and is evaluated more than once.  The expansion is fully
+   parenthesized so it can also be used inside a larger expression. */
+#define swab16(x) ((x) = be_swap16(x))
+#define swab32(x) ((x) = be_swap32(x))
+#define swab64(x) ((x) = be_swap64(x))
+
+
+#endif /* __PLATFORM_H */
/cvs/cluster/cluster/cman/qdisk/proc.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/proc.c
+++ -	2006-07-21 18:01:40.521042000 +0000
@@ -0,0 +1,128 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+  @file Quorum disk /proc/partitions scanning functions
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <disk.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <platform.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/**
+  Check whether device carries a quorum disk header.
+
+  The device is validated and opened, the header at OFFSET_HEADER is
+  read into *qh (or into a local scratch copy when qh is NULL) and
+  passed through swab_quorum_header_t().  The check succeeds when the
+  magic number matches and, if label is non-NULL, the header's
+  cluster name equals label.
+
+  Returns 0 on a match, -1 otherwise.
+ */
+int
+check_device(char *device, char *label, quorum_header_t *qh)
+{
+	quorum_header_t scratch;
+	int handle, result = -1;
+
+	/* The caller may not be interested in the header contents */
+	if (qh == NULL)
+		qh = &scratch;
+
+	handle = qdisk_validate(device);
+	if (handle < 0) {
+		perror("qdisk_verify");
+		return -1;
+	}
+
+	handle = qdisk_open(device);
+	if (handle < 0) {
+		perror("qdisk_open");
+		return -1;
+	}
+
+	if (qdisk_read(handle, OFFSET_HEADER, qh,
+		       sizeof(*qh)) == sizeof(*qh)) {
+		swab_quorum_header_t(qh);
+		if (qh->qh_magic == HEADER_MAGIC_NUMBER &&
+		    (label == NULL || strcmp(qh->qh_cluster, label) == 0))
+			result = 0;
+	}
+
+	qdisk_close(&handle);
+
+	return result;
+}
+
+
+/**
+  Scan a /proc/partitions style file for quorum disks.
+
+  @param partfile	File to scan.
+  @param label		Label to match, or NULL to accept any label.
+  @param devname	If non-NULL (and devlen is non-zero), receives the
+			/dev path of the first matching device, always
+			NUL-terminated.
+  @param devlen		Size of devname in bytes.
+  @param print		Non-zero to print each matching device's header.
+
+  @return 0 once a match has been stored in devname, or at end of file
+	  when print is set; -1 if partfile cannot be opened, if no
+	  device matched (errno = ENOENT), or if the matching device
+	  path does not fit in devname (errno = ENAMETOOLONG).
+ */
+int
+find_partitions(const char *partfile, const char *label,
+	        char *devname, size_t devlen, int print)
+{
+	char line[4096];
+	FILE *fp;
+	int minor, major;
+	unsigned long long blkcnt;
+	char device[128];
+	char realdev[256];
+	quorum_header_t qh;
+	time_t created;
+
+	fp = fopen(partfile, "r");
+	if (!fp)
+		return -1;
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		if (strlen(line) > 128 + (22) /* 5 + 5 + 11 + 1 */) {
+			/*printf("Line too long!\n");*/
+			continue;
+		}
+
+		/* This line is taken from 2.6.15.4's proc line.  Lines
+		   that do not parse completely (the column header,
+		   blank lines) are skipped, so device never holds an
+		   uninitialized or stale name; the field width keeps
+		   the name inside device[]. */
+		if (sscanf(line, "%4d %4d %10llu %127s", &major, &minor,
+			   &blkcnt, device) != 4)
+			continue;
+
+		snprintf(realdev, sizeof(realdev), "/dev/%s", device);
+		if (check_device(realdev, (char *)label, &qh) != 0)
+			continue;
+
+		if (print) {
+			/* Go through a real time_t rather than pointing
+			   ctime() at the header field itself */
+			created = (time_t)qh.qh_timestamp;
+
+			printf("%s:\n", realdev);
+			printf("\tMagic:   %08x\n", qh.qh_magic);
+			printf("\tLabel:   %s\n", qh.qh_cluster);
+			printf("\tCreated: %s", ctime(&created));
+			printf("\tHost:    %s\n\n", qh.qh_updatehost);
+		}
+
+		if (devname && devlen) {
+			/* Got it.  Refuse to hand back a truncated
+			   device path. */
+			if (strlen(realdev) >= devlen) {
+				fclose(fp);
+				errno = ENAMETOOLONG;
+				return -1;
+			}
+			strcpy(devname, realdev);
+			fclose(fp);
+			return 0;
+		}
+	}
+
+	fclose(fp);
+
+	if (print)
+		/* No errors if we're just printing stuff */
+		return 0;
+
+	errno = ENOENT;
+	return -1;
+}
/cvs/cluster/cluster/cman/qdisk/score.c,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/score.c
+++ -	2006-07-21 18:01:40.622758000 +0000
@@ -0,0 +1,383 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+  @file Quorum daemon scoring functions + thread.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <string.h>
+#include <ccs.h>
+#include <clulog.h>
+#include "score.h"
+
+static pthread_mutex_t sc_lock = PTHREAD_MUTEX_INITIALIZER;
+static int _score = 0, _maxscore = 0, _score_thread_running = 0;
+static pthread_t score_thread = (pthread_t)0;
+
+struct h_arg {
+	struct h_data *h;
+	int count;
+};
+
+
+/*
+  Close stdin, stdout and stderr, then reopen each one on /dev/null
+  (stdin read-only, the other two write-only).  If an open() does not
+  land on the expected descriptor it is dup2()'d into place.
+
+  XXX Messy, but works for now...
+ */
+void
+nullify(void)
+{
+	static const int modes[3] = { O_RDONLY, O_WRONLY, O_WRONLY };
+	int target, fd;
+
+	for (target = 0; target < 3; target++)
+		close(target);
+
+	for (target = 0; target < 3; target++) {
+		fd = open("/dev/null", modes[target]);
+		if (fd != target)
+			dup2(fd, target);
+	}
+}
+
+
+/**
+  Spin off a user-defined heuristic
+
+  The program is run as "/bin/sh -c <program>" in a forked child whose
+  stdin/stdout/stderr point at /dev/null.  Nothing is started when the
+  heuristic is not yet due (current time before h->nextrun).
+
+  @return 0 if the child was started or the heuristic is not due yet;
+	  -1 with errno = EINPROGRESS if a previous run has not been
+	  reaped, or -1 if fork() failed.
+ */
+static int
+fork_heuristic(struct h_data *h)
+{
+	int pid;
+	char *argv[4];
+	time_t now;
+
+	if (h->childpid) {	
+		errno = EINPROGRESS;
+		return -1;
+	}
+
+	now = time(NULL);
+	if (now < h->nextrun)
+		return 0;
+
+	h->nextrun = now + h->interval;
+
+	pid = fork();
+	if (pid < 0)
+		return -1;
+
+	if (pid) {
+		/* Parent: remember the child so it can be reaped */
+		h->childpid = pid;
+		return 0;
+	}
+
+	/* Child */
+	argv[0] = "/bin/sh";
+	argv[1] = "-c";
+	argv[2] = h->program;
+	argv[3] = NULL;
+
+	nullify();
+
+	execv("/bin/sh", argv);
+
+	/* execv() only returns on failure.  The child must terminate
+	   here: returning would let it carry on in the caller's code
+	   as a second copy of the daemon.  A non-zero exit status makes
+	   the parent count the heuristic as failed. */
+	_exit(127);
+}
+
+
+/**
+  Add up the heuristic table: *maxscore receives the sum of every
+  entry's score, *score the sum over the entries currently marked
+  available.
+ */
+static void
+total_score(struct h_data *h, int max, int *score, int *maxscore)
+{
+	int idx;
+
+	*score = 0;
+	*maxscore = 0;
+
+	for (idx = 0; idx < max; idx++) {
+		const struct h_data *cur = &h[idx];
+
+		*maxscore += cur->score;
+		if (!cur->available)
+			continue;
+		*score += cur->score;
+	}
+}
+
+
+/**
+  Check for response from a user-defined heuristic / script
+
+  Reaps the heuristic's child process, if there is one.  The heuristic
+  is marked available only when the child exited normally with status
+  0; any other outcome marks it unavailable.
+
+  @param h	Heuristic to check.
+  @param block	Non-zero to wait for the child to finish, zero to poll.
+  @return	0 if there is no child, the child is still running, or
+		the child was reaped; -1 if waitpid() failed.
+ */
+static int
+check_heuristic(struct h_data *h, int block)
+{
+	int ret;
+	int status = 0;
+
+	if (h->childpid == 0)
+		return 0;
+
+	/* An interrupted wait has not reaped anything; try again so
+	   the child is not forgotten */
+	do {
+		ret = waitpid(h->childpid, &status, block?0:WNOHANG);
+	} while (ret < 0 && errno == EINTR);
+
+	if (!block && ret == 0)
+		/* Still running */
+		return 0;
+
+	h->childpid = 0;
+	h->available = 0;
+	if (ret < 0)
+		/* No status was delivered; do not inspect it */
+		return -1;
+	if (!WIFEXITED(status))
+		return 0;
+	if (WEXITSTATUS(status) != 0)
+		return 0;
+	h->available = 1;
+	return 0;
+}
+
+
+/**
+  Call fork_heuristic() on each of the first max entries of h, in
+  order.  Individual results are ignored; always returns 0.
+ */
+static int
+fork_heuristics(struct h_data *h, int max)
+{
+	int idx = 0;
+
+	while (idx < max) {
+		fork_heuristic(h + idx);
+		idx++;
+	}
+
+	return 0;
+}
+
+
+/**
+  Call check_heuristic() on each of the first max entries of h, in
+  order, passing block through.  Individual results are ignored;
+  always returns 0.
+ */
+static int
+check_heuristics(struct h_data *h, int max, int block)
+{
+	int idx = 0;
+
+	while (idx < max) {
+		check_heuristic(h + idx, block);
+		idx++;
+	}
+
+	return 0;
+}
+
+
+/**
+  Fill h with up to max heuristics read from CCS.
+
+  Entries are queried as /cluster/quorumd/heuristic[N], N counting
+  from 1, until one has no program attribute.  Score defaults to 1 and
+  interval to 2; non-positive configured values fall back to those
+  defaults.  The program string returned by ccs_get() is kept in the
+  entry.
+
+  Returns the number of heuristics loaded, or -1 if h is NULL or max
+  is 0.
+ */
+int
+configure_heuristics(int ccsfd, struct h_data *h, int max)
+{
+	struct h_data *cur;
+	char xpath[128];
+	char *str;
+	int n = 0;
+
+	if (h == NULL || max == 0)
+		return -1;
+
+	do {
+		cur = &h[n];
+
+		/* Start every slot from the defaults */
+		cur->program = NULL;
+		cur->available = 0;
+		cur->interval = 2;
+		cur->score = 1;
+		cur->childpid = 0;
+		cur->nextrun = 0;
+
+		/* Program; its absence ends the list */
+		snprintf(xpath, sizeof(xpath),
+			 "/cluster/quorumd/heuristic[%d]/@program", n+1);
+		if (ccs_get(ccsfd, xpath, &str) != 0)
+			break;
+		cur->program = str;
+
+		/* Score */
+		snprintf(xpath, sizeof(xpath),
+			 "/cluster/quorumd/heuristic[%d]/@score", n+1);
+		if (!ccs_get(ccsfd, xpath, &str)) {
+			cur->score = atoi(str);
+			free(str);
+			if (cur->score <= 0)
+				cur->score = 1;
+		}
+
+		/* Interval between runs */
+		snprintf(xpath, sizeof(xpath),
+			 "/cluster/quorumd/heuristic[%d]/@interval", n+1);
+		if (!ccs_get(ccsfd, xpath, &str)) {
+			cur->interval = atoi(str);
+			free(str);
+			if (cur->interval <= 0)
+				cur->interval = 2;
+		}
+
+		clulog(LOG_DEBUG, "Heuristic: '%s' score=%d interval=%d\n",
+		       cur->program, cur->score, cur->interval);
+
+		n++;
+	} while (n < max);
+
+	clulog(LOG_DEBUG, "%d heuristics loaded\n", n);
+
+	return n;
+}
+
+
+/**
+  Hand the caller the score and maximum score most recently published
+  by the scoring thread.  Both values are read under sc_lock.  Always
+  returns 0.
+ */
+int
+get_my_score(int *score, int *maxscore)
+{
+	int cur, top;
+
+	pthread_mutex_lock(&sc_lock);
+	cur = _score;
+	top = _maxscore;
+	pthread_mutex_unlock(&sc_lock);
+
+	*score = cur;
+	*maxscore = top;
+
+	return 0;
+}
+
+
+/**
+  Body of the scoring thread.  arg is a struct h_arg that this thread
+  owns and frees, together with its heuristic array, on exit.
+
+  While _score_thread_running is set: start the heuristics, poll them
+  without blocking, total the scores, publish the totals under
+  sc_lock, then sleep one second.
+ */
+void *
+score_thread_main(void *arg)
+{
+	struct h_arg *work = arg;
+	int cur, top;
+
+	for (; _score_thread_running; ) {
+		fork_heuristics(work->h, work->count);
+		check_heuristics(work->h, work->count, 0);
+		total_score(work->h, work->count, &cur, &top);
+
+		pthread_mutex_lock(&sc_lock);
+		_score = cur;
+		_maxscore = top;
+		pthread_mutex_unlock(&sc_lock);
+
+		/* Skip the nap if we were told to stop meanwhile */
+		if (!_score_thread_running)
+			continue;
+		sleep(1);
+	}
+
+	free(work->h);
+	free(work);
+	printf("Score thread going away\n");
+	return (NULL);
+}
+
+
+/**
+  Stop the scoring thread, if it is running: clear its run flag and
+  join it.  Always returns 0.
+ */
+int
+stop_score_thread(void)
+{
+	void *unused;
+
+	if (_score_thread_running) {
+		_score_thread_running = 0;
+		pthread_join(score_thread, &unused);
+	}
+
+	return 0;
+}
+
+
+/**
+  Start the score thread.  h is copied into an argument which is
+  passed in as the arg parameter in the score thread, so it is safe
+  to pass in h if it was allocated on the stack.
+
+  @param h	Array of configured heuristics.
+  @param count	Number of entries in h; must be positive.
+  @return	0 if the thread was started; -1 if there is nothing to
+		run, memory could not be allocated, or the thread could
+		not be created (errno is then set to the error code
+		returned by pthread_create()).
+ */
+int
+start_score_thread(struct h_data *h, int count)
+{
+	pthread_attr_t attrs;
+	struct h_arg *args;
+	int err;
+
+	if (!h || count <= 0)
+		return -1;
+
+	args = malloc(sizeof(struct h_arg));
+	if (!args)
+		return -1;
+
+	args->h = malloc(sizeof(struct h_data) * count);
+	if (!args->h) {
+		free(args);
+		return -1;
+	}
+
+	memcpy(args->h, h, (sizeof(struct h_data) * count));
+	args->count = count;
+
+	/* The flag must be set before the thread starts: the thread's
+	   loop tests it right away */
+	_score_thread_running = 1;
+	pthread_attr_init(&attrs);
+	pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
+	err = pthread_create(&score_thread, &attrs, score_thread_main, args);
+	pthread_attr_destroy(&attrs);
+
+	/* Judge success by pthread_create()'s return value; the
+	   contents of score_thread say nothing after a failed call */
+	if (err == 0)
+		return 0;
+
+	/* No thread exists to take ownership of args */
+	_score_thread_running = 0;
+	free(args->h);
+	free(args);
+	errno = err;
+	return -1;
+}
+
+
+/*
+  Stand-alone test driver, compiled out by the #if 0 below.  It loads
+  up to 10 heuristics from the "test" cluster configuration, starts
+  the score thread, prints the current score once a second for ten
+  rounds, stops the thread and prints the final score.
+ */
+#if 0
+int
+main(int argc, char **argv)
+{
+	struct h_data h[10];
+	int max = 0, score, maxscore, ccsfd;
+
+	ccsfd = ccs_force_connect("test", 1);
+	if (ccsfd < 0) 
+		return -1;
+	max = configure_heuristics(ccsfd, h, 10);
+	ccs_disconnect(ccsfd);
+	
+	start_score_thread(h, max);
+	/* max is reused from here on as the round counter */
+	max = 0;
+	while (max < 10) {
+		get_my_score(&score,&maxscore);
+		printf("current %d/%d\n", score, maxscore);
+		sleep(1);
+		++max;
+	}
+	stop_score_thread();
+
+	get_my_score(&score,&maxscore);
+	printf("final! %d/%d\n", score, maxscore);
+
+	return 0;
+}
+#endif
+
/cvs/cluster/cluster/cman/qdisk/score.h,v  -->  standard output
revision 1.2.2.1
--- cluster/cman/qdisk/score.h
+++ -	2006-07-21 18:01:40.712579000 +0000
@@ -0,0 +1,60 @@
+/**
+  Copyright Red Hat, Inc. 2006
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+
+  Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+  @file Quorum daemon scoring functions + thread header file
+ */
+#ifndef _SCORE_H
+#define _SCORE_H
+
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+/*
+   One configured heuristic and its run-time state.
+ */
+struct h_data {
+	/* Command line of the heuristic program */
+	char *	program;
+	/* Weight this heuristic contributes to the score */
+	int	score;
+	/* Non-zero when the heuristic currently counts toward the score */
+	int	available;
+	/* Interval between runs of the program */
+	int	interval;
+	/* PID of the running child, 0 when none */
+	pid_t	childpid;
+	/* Earliest time at which the program is run again */
+	time_t	nextrun;
+};
+
+/*
+   Grab score data from CCSD: fill hp with at most max heuristics read
+   through the CCS descriptor ccsfd.
+ */
+int configure_heuristics(int ccsfd, struct h_data *hp, int max);
+
+/*
+   Stop the thread which runs the scoring applets.
+ */
+int stop_score_thread(void);
+
+/*
+   Start the thread which runs the scoring applets; h holds count
+   heuristics.
+ */
+int start_score_thread(struct h_data *h, int count);
+
+/*
+   Get our score + maxscore
+ */
+int get_my_score(int *score, int *maxscore);
+
+#endif




More information about the Cluster-devel mailing list