[Cluster-devel] cluster/cman Makefile init.d/Makefile man/Make ...
lhh at sourceware.org
lhh at sourceware.org
Fri Jul 21 18:01:40 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Branch: STABLE
Changes by: lhh at sourceware.org 2006-07-21 18:01:38
Modified files:
cman : Makefile
cman/init.d : Makefile
cman/man : Makefile
Added files:
cman/init.d : qdiskd
cman/man : mkqdisk.8 qdisk.5 qdiskd.8
cman/qdisk : Makefile README bitmap.c clulog.c clulog.h
crc32.c disk.c disk.h disk_util.c gettid.c
gettid.h main.c mkqdisk.c platform.h proc.c
score.c score.h
Log message:
Merge from RHEL4 branch; add QDisk
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.4.8.1&r2=1.4.8.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/qdiskd.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/init.d/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1&r2=1.1.8.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/mkqdisk.8.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdisk.5.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/qdiskd.8.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/man/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1&r2=1.1.8.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/Makefile.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.5.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/README.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/bitmap.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/clulog.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/crc32.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/disk_util.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/gettid.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/gettid.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/main.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/mkqdisk.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/platform.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/proc.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cman/qdisk/score.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=NONE&r2=1.2.2.1
--- cluster/cman/Makefile 2005/07/05 16:01:29 1.4.8.1
+++ cluster/cman/Makefile 2006/07/21 18:01:37 1.4.8.2
@@ -14,14 +14,17 @@
all:
cd cman_tool && ${MAKE} all
cd lib && ${MAKE} all
+ cd qdisk && ${MAKE} all
copytobin:
cd cman_tool && ${MAKE} copytobin
+ cd qdisk && ${MAKE} copytobin
cd lib && ${MAKE} copytobin
clean:
cd bin && ${MAKE} clean
cd cman_tool && ${MAKE} clean
+ cd qdisk && ${MAKE} clean
cd lib && ${MAKE} clean
distclean: clean
@@ -31,10 +34,12 @@
cd man && ${MAKE} install
cd cman_tool && ${MAKE} install
cd lib && ${MAKE} install
+ cd qdisk && ${MAKE} install
cd init.d && ${MAKE} install
uninstall:
cd cman_tool && ${MAKE} uninstall
cd lib && ${MAKE} uninstall
cd man && ${MAKE} uninstall
+ cd qdisk && ${MAKE} uninstall
cd init.d && ${MAKE} uninstall
/cvs/cluster/cluster/cman/init.d/qdiskd,v --> standard output
revision 1.2.2.1
--- cluster/cman/init.d/qdiskd
+++ - 2006-07-21 18:01:38.720108000 +0000
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# chkconfig: 345 22 78
+# description: Starts and stops the quorum disk daemon
+#
+#
+### BEGIN INIT INFO
+# Provides:
+### END INIT INFO
+
+# SysV-style init script for qdiskd.  Supports start, stop, restart and
+# status; any other argument prints a usage line.
+#
+# NOTE(review): action, killproc, status, echo_success and echo_failure are
+# not defined in this script; presumably they come from the functions
+# library sourced below -- confirm.
+. /etc/init.d/functions
+# Optional site-specific settings; sourced only when the file exists.
+[ -f /etc/sysconfig/cluster ] && . /etc/sysconfig/cluster
+
+# Created after a successful start and removed after a successful stop.
+LOCK_FILE="/var/lock/subsys/qdiskd"
+
+# Exit status of the script.  It defaults to failure, so a branch that does
+# not assign it (the usage branch) makes the script exit with 1.
+rtrn=1
+# Number of extra kill attempts made so far by the stop branch.
+retries=0
+
+# See how we were called.
+case "$1" in
+ start)
+ action "Starting the Quorum Disk Daemon:" qdiskd
+ rtrn=$?
+ # Only record the lock file when the start command succeeded.
+ [ $rtrn = 0 ] && touch $LOCK_FILE
+ ;;
+
+ stop)
+ echo -n "Stopping the Quorum Disk Daemon:"
+ killproc qdiskd
+ # While pidof still reports a qdiskd process, wait one second and
+ # try again, giving up after 5 retries.
+ while [ -n "`pidof qdiskd`" ] && [ $retries -lt 5 ]; do
+ sleep 1
+ killproc qdiskd
+ ((retries++))
+ done
+ # Success is judged by pidof finding no qdiskd process, not by the
+ # exit status of killproc.
+ if [ -z "`pidof qdiskd`" ]; then
+ echo_success
+ echo
+ rtrn=0
+ rm -f $LOCK_FILE
+ else
+ echo_failure
+ echo
+ rtrn=1
+ fi
+ ;;
+
+ restart)
+ # Re-invoke this script; abort with stop's status if stop fails.
+ $0 stop || exit $?
+ $0 start
+ rtrn=$?
+ ;;
+
+ status)
+ status qdiskd
+ rtrn=$?
+ ;;
+
+ *)
+ # rtrn is left at its initial value of 1 here.
+ echo $"Usage: $0 {start|stop|restart|status}"
+ ;;
+esac
+
+exit $rtrn
--- cluster/cman/init.d/Makefile 2004/12/17 20:07:59 1.1
+++ cluster/cman/init.d/Makefile 2006/07/21 18:01:38 1.1.8.1
@@ -10,7 +10,7 @@
###############################################################################
###############################################################################
-TARGET= cman
+TARGET= cman qdiskd
UNINSTALL=${top_srcdir}/scripts/uninstall.pl
/cvs/cluster/cluster/cman/man/mkqdisk.8,v --> standard output
revision 1.2.4.1
--- cluster/cman/man/mkqdisk.8
+++ - 2006-07-21 18:01:38.882836000 +0000
@@ -0,0 +1,23 @@
+.TH "mkqdisk" "8" "July 2006" "" "Quorum Disk Management"
+.SH "NAME"
+mkqdisk \- Cluster Quorum Disk Utility
+.SH "WARNING"
+Use of this command can cause the cluster to malfunction.
+.SH "SYNOPSIS"
+\fBmkqdisk [\-?|\-h] | [\-L] | [\-f \fPlabel\fB] [\-c \fPdevice \fB\-l \fPlabel\fB]\fP
+.SH "DESCRIPTION"
+.PP
+The \fBmkqdisk\fP command is used to create a new quorum disk or display
+existing quorum disks accessible from a given cluster node.
+.SH "OPTIONS"
+.IP "\-c device \-l label"
+Initialize a new cluster quorum disk. This will destroy all data on the given
+device. If a cluster is currently using that device as a quorum disk, the
+entire cluster will malfunction. Do not run this on an active cluster.
+.IP "\-f label"
+Find the cluster quorum disk with the given label and display information about it.
+.IP "\-L"
+Display information on all accessible cluster quorum disks.
+
+.SH "SEE ALSO"
+qdisk(5) qdiskd(8)
/cvs/cluster/cluster/cman/man/qdisk.5,v --> standard output
revision 1.2.4.1
--- cluster/cman/man/qdisk.5
+++ - 2006-07-21 18:01:38.970862000 +0000
@@ -0,0 +1,309 @@
+.TH "QDisk" "5" "July 2006" "" "Cluster Quorum Disk"
+.SH "NAME"
+QDisk 1.0 \- a disk-based quorum daemon for CMAN / Linux-Cluster
+.SH "1. Overview"
+.SH "1.1 Problem"
+In some situations, it may be necessary or desirable to sustain
+a majority node failure of a cluster without introducing the need for
+asymmetric cluster configurations (e.g. client-server, or heavily-weighted
+voting nodes).
+
+.SH "1.2. Design Requirements"
+* Ability to sustain 1..(n-1)/n simultaneous node failures, without the
+danger of a simple network partition causing a split brain. That is, we
+need to be able to ensure that the majority failure case is not merely
+the result of a network partition.
+
+* Ability to use external reasons for deciding which partition is the
+quorate partition in a partitioned cluster. For example, a user may
+have a service running on one node, and that node must always be the master
+in the event of a network partition. Or, a node might lose all network
+connectivity except the cluster communication path - in which case, a
+user may wish that node to be evicted from the cluster.
+
+* Integration with CMAN. We must not require CMAN to run with us (or
+without us). Linux-Cluster does not require a quorum disk normally -
+introducing new requirements on the base of how Linux-Cluster operates
+is not allowed.
+
+* Data integrity. In order to recover from a majority failure, fencing
+is required. The fencing subsystem is already provided by Linux-Cluster.
+
+* Non-reliance on hardware or protocol specific methods (i.e. SCSI
+reservations). This ensures the quorum disk algorithm can be used on the
+widest range of hardware configurations possible.
+
+* Little or no memory allocation after initialization. In critical paths
+during failover, we do not want to have to worry about being killed during
+a memory pressure situation because we request a page fault, and the Linux
+OOM killer responds...
+
+.SH "1.3. Hardware Considerations and Requirements"
+.SH "1.3.1. Concurrent, Synchronous, Read/Write Access"
+This quorum daemon requires a shared block device with concurrent read/write
+access from all nodes in the cluster. The shared block device can be
+a multi-port SCSI RAID array, a Fiber-Channel RAID SAN, a RAIDed iSCSI
+target, or even GNBD. The quorum daemon uses O_DIRECT to write to the
+device.
+
+.SH "1.3.2. Bargain-basement JBODs need not apply"
+There is a minimum performance requirement inherent when using disk-based
+cluster quorum algorithms, so design your cluster accordingly. Using a
+cheap JBOD with old SCSI2 disks on a multi-initiator bus will cause
+problems at the first load spike. Plan your loads accordingly; a node's
+inability to write to the quorum disk in a timely manner will cause the
+cluster to evict the node. Using host-RAID or multi-initiator parallel
+SCSI configurations with the qdisk daemon is unlikely to work, and will
+probably cause administrators a lot of frustration. That having been
+said, because the timeouts are configurable, most hardware should work
+if the timeouts are set high enough.
+
+.SH "1.3.3. Fencing is Required"
+In order to maintain data integrity under all failure scenarios, use of
+this quorum daemon requires adequate fencing, preferably power-based
+fencing. Watchdog timers and software-based solutions to reboot the node
+internally, while possibly sufficient, are not considered 'fencing' for
+the purposes of using the quorum disk.
+
+.SH "1.4. Limitations"
+* At this time, this daemon supports a maximum of 16 nodes. This is
+primarily a scalability issue: As we increase the node count, we increase
+the amount of synchronous I/O contention on the shared quorum disk.
+
+* Cluster node IDs must be statically configured in cluster.conf and
+must be numbered from 1..16 (there can be gaps, of course).
+
+* Cluster node votes should be more or less equal.
+
+* CMAN must be running before the qdisk program can start.
+
+* CMAN's eviction timeout should be at least 2x the quorum daemon's
+to give the quorum daemon adequate time to converge on a master during a
+failure + load spike situation.
+
+* The total number of votes assigned to the quorum device should be
+equal to or greater than the total number of node-votes in the cluster.
+While it is possible to assign only one (or a few) votes to the quorum
+device, the effects of doing so have not been explored.
+
+* Currently, the quorum disk daemon is difficult to use with CLVM if
+the quorum disk resides on a CLVM logical volume. CLVM requires a
+quorate cluster to correctly operate, which introduces a chicken-and-egg
+problem for starting the cluster: CLVM needs quorum, but the quorum daemon
+needs CLVM (if and only if the quorum device lies on CLVM-managed storage).
+One way to work around this is to *not* set the cluster's expected votes
+to include the quorum daemon's votes. Bring all nodes online, and start
+the quorum daemon *after* the whole cluster is running. This will allow
+the expected votes to increase naturally.
+
+.SH "2. Algorithms"
+.SH "2.1. Heartbeating & Liveliness Determination"
+Nodes update individual status blocks on the quorum disk at a user-
+defined rate. Each write of a status block alters the timestamp, which
+is what other nodes use to decide whether a node has hung or not. After
+a user-defined number of 'misses' (that is, failures to update the
+timestamp), a node is declared offline. After a certain number of 'hits'
+(changed timestamp + "i am alive" state), the node is declared online.
+
+The status block contains additional information, such as a bitmask of
+the nodes that node believes are online. Some of this information is
+used by the master - while some is just for performance recording, and
+may be used at a later time. The most important pieces of information
+a node writes to its status block are:
+
+.in 12
+- Timestamp
+.br
+- Internal state (available / not available)
+.br
+- Score
+.br
+- Known max score (may be used in the future to detect invalid configurations)
+.br
+- Vote/bid messages
+.br
+- Other nodes it thinks are online
+.in 0
+
+.SH "2.2. Scoring & Heuristics"
+The administrator can configure up to 10 purely arbitrary heuristics, and
+must exercise caution in doing so. At least one administrator-
+defined heuristic is required for operation, but it is generally a good
+idea to have more than one heuristic. By default, only nodes scoring over
+1/2 of the total maximum score will claim they are available via the
+quorum disk, and a node (master or otherwise) whose score drops too low
+will remove itself (usually, by rebooting).
+
+The heuristics themselves can be any command executable by 'sh -c'. For
+example, in early testing the following was used:
+
+.ti 12
+<\fBheuristic \fP\fIprogram\fP\fB="\fP[ -f /quorum ]\fB" \fP\fIscore\fP\fB="\fP10\fB" \fP\fIinterval\fP\fB="\fP2\fB"/>\fP
+
+This is a literal sh-ism which tests for the existence of a file called
+"/quorum". Without that file, the node would claim it was unavailable.
+This is an awful example, and should never, ever be used in production,
+but is provided as an example as to what one could do...
+
+Typically, the heuristics should be snippets of shell code or commands which
+help determine a node's usefulness to the cluster or clients. Ideally, you
+want to add traces for all of your network paths (e.g. check links, or
+ping routers), and methods to detect availability of shared storage.
+
+.SH "2.3. Master Election"
+Only one master is present at any one time in the cluster, regardless of
+how many partitions exist within the cluster itself. The master is
+elected by a simple voting scheme in which the lowest node which believes
+it is capable of running (i.e. scores high enough) bids for master status.
+If the other nodes agree, it becomes the master. This algorithm is
+run whenever no master is present.
+
+If another node comes online with a lower node ID while a node is still
+bidding for master status, it will rescind its bid and vote for the lower
+node ID. If a master dies or a bidding node dies, the voting algorithm
+is started over. The voting algorithm typically takes two passes to
+complete.
+
+Master deaths take marginally longer to recover from than non-master
+deaths, because a new master must be elected before the old master can
+be evicted & fenced.
+
+.SH "2.4. Master Duties"
+The master node decides who is or is not in the master partition, as
+well as handles eviction of dead nodes (both via the quorum disk and via
+the linux-cluster fencing system by using the cman_kill_node() API).
+
+.SH "2.5. How it All Ties Together"
+When a master is present, and if the master believes a node to be online,
+that node will advertise to CMAN that the quorum disk is available. The
+master will only grant a node membership if:
+
+.in 12
+(a) CMAN believes the node to be online, and
+.br
+(b) that node has made enough consecutive, timely writes
+.in 16
+to the quorum disk, and
+.in 12
+(c) the node has a high enough score to consider itself online.
+.in 0
+
+.SH "3. Configuration"
+.SH "3.1. The <quorumd> tag"
+This tag is a child of the top-level <cluster> tag.
+
+.in 8
+\fB<quorumd\fP
+.in 9
+\fIinterval\fP\fB="\fP1\fB"\fP
+.in 12
+This is the frequency of read/write cycles
+
+.in 9
+\fItko\fP\fB="\fP10\fB"\fP
+.in 12
+This is the number of cycles a node must miss in order to be declared dead.
+
+.in 9
+\fIvotes\fP\fB="\fP3\fB"\fP
+.in 12
+This is the number of votes the quorum daemon advertises to CMAN when it
+has a high enough score.
+
+.in 9
+\fIlog_level\fP\fB="\fP4\fB"\fP
+.in 12
+This controls the verbosity of the quorum daemon in the system logs.
+0 = emergencies; 7 = debug.
+
+.in 9
+\fIlog_facility\fP\fB="\fPlocal4\fB"\fP
+.in 12
+This controls the syslog facility used by the quorum daemon when logging.
+For a complete list of available facilities, see \fBsyslog.conf(5)\fP.
+
+.in 9
+\fIstatus_file\fP\fB="\fP/foo\fB"\fP
+.in 12
+Write internal states out to this file periodically ("-" = use stdout).
+This is primarily used for debugging.
+
+.in 9
+\fImin_score\fP\fB="\fP3\fB"\fP
+.in 12
+Absolute minimum score required to consider one's self "alive". If omitted,
+or set to 0, the default function "floor((n+1)/2)" is used, where \fIn\fP
+is the sum total of all defined heuristics' \fIscore\fP attributes.
+
+.in 9
+\fIdevice\fP\fB="\fP/dev/sda1\fB"\fP
+.in 12
+This is the device the quorum daemon will use. This device must be the
+same on all nodes.
+
+.in 9
+\fIlabel\fP\fB="\fPmylabel\fB"/>\fP
+.in 12
+This overrides the device field if present. If specified, the quorum
+daemon will read /proc/partitions and check for qdisk signatures
+on every block device found, comparing the label against the specified
+label. This is useful in configurations where the block device name
+differs on a per-node basis.
+.in 0
+
+.SH "3.2. The <heuristic> tag"
+This tag is a child of the <quorumd> tag.
+
+.in 8
+\fB<heuristic\fP
+.in 9
+\fIprogram\fP\fB="\fP/test.sh\fB"\fP
+.in 12
+This is the program used to determine if this heuristic is alive. This
+can be anything which may be executed by \fI/bin/sh -c\fP. A return
+value of zero indicates success; anything else indicates failure.
+
+.in 9
+\fIscore\fP\fB="\fP1\fB"\fP
+.in 12
+This is the weight of this heuristic. Be careful when determining scores
+for heuristics.
+
+.in 9
+\fIinterval\fP\fB="\fP2\fB"/>\fP
+.in 12
+This is the frequency at which we poll the heuristic.
+.in 0
+
+.SH "3.3. Example"
+.in 8
+<quorumd interval="1" tko="10" votes="3" label="testing">
+.in 12
+<heuristic program="ping A -c1 -t1" score="1" interval="2"/>
+.br
+<heuristic program="ping B -c1 -t1" score="1" interval="2"/>
+.br
+<heuristic program="ping C -c1 -t1" score="1" interval="2"/>
+.br
+.in 8
+</quorumd>
+.in 0
+
+.SH "3.4. Heuristic score considerations"
+* Heuristic timeouts should be set high enough to allow the previous run
+of a given heuristic to complete.
+
+* Heuristic scripts returning anything except 0 as their return code
+are considered failed.
+
+* The worst-case for improperly configured quorum heuristics is a race
+to fence where two partitions simultaneously try to kill each other.
+
+.SH "3.5. Creating a quorum disk partition"
+The mkqdisk utility can create and list currently configured quorum disks
+visible to the local node; see
+.B mkqdisk(8)
+for more details.
+
+.SH "SEE ALSO"
+mkqdisk(8), qdiskd(8), cman(5), syslog.conf(5)
/cvs/cluster/cluster/cman/man/qdiskd.8,v --> standard output
revision 1.2.4.1
--- cluster/cman/man/qdiskd.8
+++ - 2006-07-21 18:01:39.053646000 +0000
@@ -0,0 +1,20 @@
+.TH "qdiskd" "8" "July 2006" "" "Quorum Disk Management"
+.SH "NAME"
+qdiskd \- Cluster Quorum Disk Daemon
+.SH "SYNOPSIS"
+\fBqdiskd [\-f] [\-d]\fP
+.SH "DESCRIPTION"
+.PP
+The \fBqdiskd\fP daemon talks to CMAN and provides a mechanism for determining
+node-fitness in a cluster environment. See
+.B
+qdisk(5)
+for configuration information.
+.SH "OPTIONS"
+.IP "\-f"
+Run in the foreground (do not fork / daemonize).
+.IP "\-d"
+Enable debug output.
+
+.SH "SEE ALSO"
+mkqdisk(8), qdisk(5), cman(5)
--- cluster/cman/man/Makefile 2004/08/13 06:38:22 1.1
+++ cluster/cman/man/Makefile 2006/07/21 18:01:38 1.1.8.1
@@ -18,10 +18,10 @@
install:
install -d ${mandir}/man5
install -d ${mandir}/man8
- install cman.5 ${mandir}/man5
- install cman_tool.8 ${mandir}/man8
+ install cman.5 qdisk.5 ${mandir}/man5
+ install cman_tool.8 qdiskd.8 mkqdisk.8 ${mandir}/man8
uninstall:
- ${UNINSTALL} cman.5 ${mandir}/man5
- ${UNINSTALL} cman_tool.8 ${mandir}/man8
+ ${UNINSTALL} cman.5 qdisk.5 ${mandir}/man5
+ ${UNINSTALL} cman_tool.8 qdiskd.8 mkqdisk.8 ${mandir}/man8
/cvs/cluster/cluster/cman/qdisk/Makefile,v --> standard output
revision 1.5.2.1
--- cluster/cman/qdisk/Makefile
+++ - 2006-07-21 18:01:39.244798000 +0000
@@ -0,0 +1,49 @@
+###############################################################################
+###############################################################################
+##
+## Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+##
+## This copyrighted material is made available to anyone wishing to use,
+## modify, copy, or redistribute it subject to the terms and conditions
+## of the GNU General Public License v.2.
+##
+###############################################################################
+###############################################################################
+
+# Build, install and clean rules for the quorum disk programs
+# (qdiskd and mkqdisk).
+
+top_srcdir=..
+UNINSTALL=${top_srcdir}/scripts/uninstall.pl
+
+# NOTE(review): incdir and sbindir are not set in this file; presumably they
+# come from defines.mk -- confirm.
+include ${top_srcdir}/make/defines.mk
+
+INCLUDES+=-I. -I../lib
+CFLAGS +=-I${incdir} -I${top_srcdir}/config \
+	-Wall -Werror -Wstrict-prototypes -Wshadow -D_GNU_SOURCE -g
+
+TARGET=qdiskd mkqdisk
+
+# These targets name commands, not files; declaring them phony keeps a stray
+# file called e.g. "clean" or "install" from silently disabling the rule.
+.PHONY: all copytobin install clean uninstall
+
+all: ${TARGET}
+
+copytobin: all
+	cp ${TARGET} ${top_srcdir}/bin
+
+install: ${TARGET}
+	install -d ${sbindir}
+	install ${TARGET} ${sbindir}
+
+# Link with $(CC), the same driver the compile rule uses, instead of a
+# hard-coded gcc so that a CC override applies to both steps.
+qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
+	gettid.o proc.o ../lib/libcman.a
+	$(CC) -o $@ $^ -lpthread -L../lib -lccs
+
+mkqdisk: disk.o crc32.o disk_util.o \
+	proc.o mkqdisk.o
+	$(CC) -o $@ $^
+
+
+# Compile only the first prerequisite ($<), so that prerequisites added to
+# an object later (e.g. headers) are not passed to the compiler as sources.
+%.o: %.c
+	$(CC) -c -o $@ $< $(INCLUDES) $(CFLAGS)
+
+clean:
+	rm -f *.o ${TARGET}
+
+uninstall:
+	${UNINSTALL} ${TARGET} ${sbindir}
/cvs/cluster/cluster/cman/qdisk/README,v --> standard output
revision 1.4.2.1
--- cluster/cman/qdisk/README
+++ - 2006-07-21 18:01:39.324979000 +0000
@@ -0,0 +1,274 @@
+qdisk 1.0 - a disk-based quorum algorithm for Linux-Cluster
+
+(C) 2006 Red Hat, Inc.
+
+1. Overview
+
+1.1. Problem
+
+In some situations, it may be necessary or desirable to sustain
+a majority node failure of a cluster without introducing the need for
+asymmetric cluster configurations (e.g. client-server, or heavily-weighted voting nodes).
+
+1.2. Design Requirements
+
+* Ability to sustain 1..(n-1)/n simultaneous node failures, without the
+danger of a simple network partition causing a split brain. That is, we
+need to be able to ensure that the majority failure case is not merely
+the result of a network partition.
+
+* Ability to use external reasons for deciding which partition is the
+quorate partition in a partitioned cluster. For example, a user may
+have a service running on one node, and that node must always be the master
+in the event of a network partition. Or, a node might lose all network
+connectivity except the cluster communication path - in which case, a
+user may wish that node to be evicted from the cluster.
+
+* Integration with CMAN. We must not require CMAN to run with us (or
+without us). Linux-Cluster does not require a quorum disk normally -
+introducing new requirements on the base of how Linux-Cluster operates
+is not allowed.
+
+* Data integrity. In order to recover from a majority failure, fencing
+is required. The fencing subsystem is already provided by Linux-Cluster.
+
+* Non-reliance on hardware or protocol specific methods (i.e. SCSI
+reservations). This ensures the quorum disk algorithm can be used on the
+widest range of hardware configurations possible.
+
+* Little or no memory allocation after initialization. In critical paths
+during failover, we do not want to have to worry about being killed during
+a memory pressure situation because we request a page fault, and the Linux
+OOM killer responds...
+
+
+1.3. Hardware Configuration Considerations
+
+1.3.1. Concurrent, Synchronous, Read/Write Access
+
+This daemon requires a shared block device with concurrent read/write
+access from all nodes in the cluster. The shared block device can be
+a multi-port SCSI RAID array, a Fiber-Channel RAID SAN, a RAIDed iSCSI
+target, or even GNBD. The quorum daemon uses O_DIRECT to write to the
+device.
+
+1.3.2. Bargain-basement JBODs need not apply
+
+There is a minimum performance requirement inherent when using disk-based
+cluster quorum algorithms, so design your cluster accordingly. Using a
+cheap JBOD with old SCSI2 disks on a multi-initiator bus will cause
+problems at the first load spike. Plan your loads accordingly; a node's
+inability to write to the quorum disk in a timely manner will cause the
+cluster to evict the node. Using host-RAID or multi-initiator parallel
+SCSI configurations with the qdisk daemon is unlikely to work, and will
+probably cause administrators a lot of frustration. That having been
+said, because the timeouts are configurable, most hardware should work
+if the timeouts are set high enough.
+
+1.3.3. Fencing is Required
+
+In order to maintain data integrity under all failure scenarios, use of
+this quorum daemon requires adequate fencing, preferably power-based
+fencing.
+
+
+1.4. Limitations
+
+* At this time, this daemon only supports a maximum of 16 nodes.
+
+* Cluster node IDs must be statically configured in cluster.conf and
+must be numbered from 1..16 (there can be gaps, of course).
+
+* Cluster node votes should be more or less equal.
+
+* CMAN must be running before the qdisk program can start. This
+limitation will be removed before a production release.
+
+* CMAN's eviction timeout should be at least 2x the quorum daemon's
+to give the quorum daemon adequate time to converge on a master during a
+failure + load spike situation.
+
+* The total number of votes assigned to the quorum device should be
+equal to or greater than the total number of node-votes in the cluster.
+While it is possible to assign only one (or a few) votes to the quorum
+device, the effects of doing so have not been explored.
+
+* Currently, the quorum disk daemon is difficult to use with CLVM if
+the quorum disk resides on a CLVM logical volume. CLVM requires a
+quorate cluster to correctly operate, which introduces a chicken-and-egg
+problem for starting the cluster: CLVM needs quorum, but the quorum daemon
+needs CLVM (if and only if the quorum device lies on CLVM-managed storage).
+One way to work around this is to *not* set the cluster's expected votes
+to include the quorum daemon's votes. Bring all nodes online, and start
+the quorum daemon *after* the whole cluster is running. This will allow
+the expected votes to increase naturally.
+
+
+2. Algorithms
+
+2.1. Heartbeating & Liveliness Determination
+
+Nodes update individual status blocks on the quorum disk at a user-
+defined rate. Each write of a status block alters the timestamp, which
+is what other nodes use to decide whether a node has hung or not. After
+a user-defined number of 'misses' (that is, failures to update the
+timestamp), a node is declared offline. After a certain number of 'hits'
+(changed timestamp + "i am alive" state), the node is declared online.
+
+The status block contains additional information, such as a bitmask of
+the nodes that node believes are online. Some of this information is
+used by the master - while some is just for performance recording, and
+may be used at a later time. The most important pieces of information
+a node writes to its status block are:
+
+ - timestamp
+ - internal state (available / not available)
+ - score
+ - max score
+ - vote/bid messages
+ - other nodes it thinks are online
+
+
+2.2. Scoring & Heuristics
+
+The administrator can configure up to 10 purely arbitrary heuristics, and
+must exercise caution in doing so. By default, only nodes scoring over
+1/2 of the total maximum score will claim they are available via the
+quorum disk, and a node (master or otherwise) whose score drops too low
+will remove itself (usually, by rebooting).
+
+The heuristics themselves can be any command executable by 'sh -c'. For
+example, in early testing, I used this:
+
+ <heuristic program="[ -f /quorum ]" score="10" interval="2"/>
+
+This is a literal sh-ism which tests for the existence of a file called
+"/quorum". Without that file, the node would claim it was unavailable.
+This is an awful example, and should never, ever be used in production,
+but is provided as an example as to what one could do...
+
+Typically, the heuristics should be snippets of shell code or commands which
+help determine a node's usefulness to the cluster or clients. Ideally, you
+want to add traces for all of your network paths (e.g. check links, or
+ping routers), and methods to detect availability of shared storage.
+
+
+2.3. Master Election
+
+Only one master is present at any one time in the cluster, regardless of
+how many partitions exist within the cluster itself. The master is
+elected by a simple voting scheme in which the lowest node which believes
+it is capable of running (i.e. scores high enough) bids for master status.
+If the other nodes agree, it becomes the master. This algorithm is
+run whenever no master is present.
+
+If another node comes online with a lower node ID while a node is still
+bidding for master status, it will rescind its bid and vote for the lower
+node ID. If a master dies or a bidding node dies, the voting algorithm
+is started over. The voting algorithm typically takes two passes to
+complete.
+
+Master deaths take marginally longer to recover from than non-master
+deaths, because a new master must be elected before the old master can
+be evicted & fenced.
+
+
+2.4. Master Duties
+
+The master node decides who is or is not in the master partition, as
+well as handles eviction of dead nodes (both via the quorum disk and via
+the linux-cluster fencing system by using the cman_kill_node() API).
+
+
+2.5. How it All Ties Together
+
+When a master is present, and if the master believes a node to be online,
+that node will advertise to CMAN that the quorum disk is available. The
+master will only grant a node membership if:
+
+ (a) CMAN believes the node to be online, and
+ (b) that node has made enough consecutive, timely writes to the quorum
+ disk.
+
+
+3. Configuration
+
+3.1. The <quorumd> tag
+
+This tag is a child of the top-level <cluster> tag.
+
+ <quorumd
+ interval="1" This is the frequency of read/write cycles
+ tko="10" This is the number of cycles a node must miss
+ in order to be declared dead.
+ votes="3" This is the number of votes the quorum daemon
+ advertises to CMAN when it has a high enough
+ score.
+ log_level="4" This controls the verbosity of the quorum daemon
+ in the system logs. 0 = emergencies; 7 = debug
+ log_facility="local4" This controls the syslog facility used by the
+ quorum daemon when logging.
+ status_file="/foo" Write internal states out to this file
+ periodically ("-" = use stdout).
+ min_score="3" Absolute minimum score required to consider one's
+ self "alive". If omitted, or set to 0, the
+ default function "floor((n+1)/2)" is used.
+ device="/dev/sda1" This is the device the quorum daemon will use.
+ This device must be the same on all nodes.
+ label="mylabel"/> This overrides the device field if present.
+ If specified, the quorum daemon will read
+ /proc/partitions and check for qdisk signatures
+ on every block device found, comparing the label
+ against the specified label. This is useful in
+ configurations where the block device name
+ differs on a per-node basis.
+
+
+3.2. The <heuristic> tag
+
+This tag is a child of the <quorumd> tag.
+
+ <heuristic
+ program="/test.sh" This is the program used to determine if this
+ heuristic is alive. This can be anything which
+ may be executed by "/bin/sh -c". A return value
+ of zero indicates success.
+ score="1" This is the weight of this heuristic. Be careful
+ when determining scores for heuristics.
+ interval="2"/> This is the frequency at which we poll the
+ heuristic.
+
+3.3. Example
+
+ <quorumd interval="1" tko="10" votes="3" device="/dev/gnbd/qdisk">
+ <heuristic program="ping routerA -c1 -t1" score="1" interval="2"/>
+ <heuristic program="ping routerB -c1 -t1" score="1" interval="2"/>
+ <heuristic program="ping routerC -c1 -t1" score="1" interval="2"/>
+ </quorumd>
+
+3.4. Heuristic score considerations
+
+* Heuristic timeouts should be set high enough to allow the previous run
+of a given heuristic to complete.
+
+* Heuristic scripts returning anything except 0 as their return code
+are considered failed.
+
+* The worst-case for improperly configured quorum heuristics is a race
+to fence where two partitions simultaneously try to kill each other.
+
+3.5. Creating a quorum disk partition
+
+3.5.1. The mkqdisk utility.
+
+The mkqdisk utility can create and list currently configured quorum disks
+visible to the local node.
+
+ mkqdisk -L List available quorum disks.
+
+ mkqdisk -f <label> Find a quorum device by the given label.
+
+ mkqdisk -c <device> -l <label>
+ Initialize <device> and name it <label>. This
+ will destroy all data on the device, so be careful
+ when running this command.
/cvs/cluster/cluster/cman/qdisk/bitmap.c,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/bitmap.c
+++ - 2006-07-21 18:01:39.411387000 +0000
@@ -0,0 +1,107 @@
+/*
+ Copyright Red Hat, Inc. 2002-2003, 2006
+
+ The Red Hat Cluster Manager API Library is free software; you can
+ redistribute it and/or modify it under the terms of the GNU Lesser
+ General Public License as published by the Free Software Foundation;
+ either version 2.1 of the License, or (at your option) any later
+ version.
+
+ The Red Hat Cluster Manager API Library is distributed in the hope
+ that it will be useful, but WITHOUT ANY WARRANTY; without even the
+ implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ PURPOSE. See the GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ USA.
+ */
+/** @file
+ * Bitmap and membership mask handling routines.
+ */
+#include <stdint.h>
+
+
+/**
+ * Turn off a single bit inside a byte-array bitmap.
+ *
+ * @param mask		Bitmap to update.
+ * @param bitidx	Zero-based position of the bit to clear.
+ * @param masklen	Size of the bitmap, counted in uint8_t units.
+ * @return		-1 when bitidx lies beyond the end of the bitmap
+ *			(the bitmap is left untouched), otherwise 0.
+ */
+int
+clear_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	/* Eight bits per array element: the quotient picks the byte. */
+	const uint32_t byte_off = bitidx / 8;
+
+	if (byte_off >= masklen)
+		return -1;
+
+	/* The remainder picks the bit within that byte. */
+	mask[byte_off] &= (uint8_t)~(1u << (bitidx % 8));
+	return 0;
+}
+
+
+/**
+ * Turn on a single bit inside a byte-array bitmap.
+ *
+ * @param mask		Bitmap to update.
+ * @param bitidx	Zero-based position of the bit to set.
+ * @param masklen	Size of the bitmap, counted in uint8_t units.
+ * @return		-1 when bitidx lies beyond the end of the bitmap
+ *			(the bitmap is left untouched), otherwise 0.
+ */
+int
+set_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	/* Eight bits per array element: the quotient picks the byte. */
+	const uint32_t byte_off = bitidx / 8;
+
+	if (byte_off >= masklen)
+		return -1;
+
+	/* The remainder picks the bit within that byte. */
+	mask[byte_off] |= (uint8_t)(1u << (bitidx % 8));
+	return 0;
+}
+
+
+/**
+ * Report whether a single bit of a byte-array bitmap is set.
+ *
+ * @param mask		Bitmap to inspect.
+ * @param bitidx	Zero-based position of the bit to test.
+ * @param masklen	Size of the bitmap, counted in uint8_t units.
+ * @return		-1 when bitidx lies beyond the end of the bitmap,
+ *			0 when the bit is clear, 1 when it is set.
+ */
+int
+is_bit_set(uint8_t *mask, uint32_t bitidx, uint32_t masklen)
+{
+	/* Eight bits per array element: the quotient picks the byte. */
+	const uint32_t byte_off = bitidx / 8;
+
+	if (byte_off >= masklen)
+		return -1;
+
+	/* Shift the wanted bit down to position 0 and isolate it. */
+	return (mask[byte_off] >> (bitidx % 8)) & 1;
+}
+
+
/cvs/cluster/cluster/cman/qdisk/clulog.c,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/clulog.c
+++ - 2006-07-21 18:01:39.508895000 +0000
@@ -0,0 +1,296 @@
+/*
+ Copyright Red Hat, Inc. 2002
+ Copyright Mission Critical Linux, 2000
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+*/
+/** @file
+ * Library routines for communicating with the logging daemon.
+ *
+ * $Id: clulog.c,v 1.2.2.1 2006/07/21 18:01:38 lhh Exp $
+ *
+ * Author: Jeff Moyer <moyer at missioncriticallinux.com>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <malloc.h>
+#include <dirent.h>
+#include <signal.h>
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <ccs.h>
+#define SYSLOG_NAMES
+#include <sys/syslog.h>
+#undef SYSLOG_NAMES
+
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <linux/unistd.h>
+#include <pthread.h>
+#include <gettid.h>
+#include <clulog.h>
+#include <string.h>
+
+
+static const char *version __attribute__ ((unused)) = "$Revision: 1.2.2.1 $";
+
+#ifdef DEBUG
+#include <assert.h>
+#define Dprintf(fmt,args...) printf(fmt,##args)
+#define DBG_ASSERT(x) assert(x)
+#else
+#define Dprintf(fmt,args...)
+#define DBG_ASSERT(x)
+#endif
+
+/*
+ * Globals
+ */
+/* Nonzero once do_clulog() has called openlog(); cleared to force a reopen. */
+static int log_is_open = 0;
+/* When nonzero, do_clulog() also prints every message to stdout. */
+static int useconsole = 0;
+/* Messages whose severity is numerically greater than this are dropped. */
+static int loglevel = LOGLEVEL_DFLT;
+/* Facility handed to openlog(). */
+static int syslog_facility = LOG_DAEMON;
+/* Heap copy of the program name used as the openlog() ident (may be NULL). */
+static char *daemon_name = NULL;
+/* Caller-supplied pid, or the gettid() value when do_clulog() is given pid 0. */
+static pid_t daemon_pid = -1;
+/* Held by do_clulog() and the facility get/set routines. */
+static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Severity labels.  do_clulog() indexes this array directly with the
+ * severity value, so the order relies on LOG_EMERG..LOG_DEBUG being 0..7.
+ * The array has no terminating sentinel entry.
+ */
+CODE logger_prioritynames[] =
+{ {"emerg", LOG_EMERG},
+ {"alert", LOG_ALERT},
+ {"crit", LOG_CRIT},
+ {"err", LOG_ERR},
+ {"warning", LOG_WARNING},
+ {"notice", LOG_NOTICE},
+ {"info", LOG_INFO},
+ {"debug", LOG_DEBUG}
+};
+
+/*
+ * Exported Functions.
+ */
+
+/**
+ * Read the current cluster log level.  The value is read without taking
+ * log_mutex.
+ *
+ * @return The current cluster log level.
+ */
+int
+clu_get_loglevel(void)
+{
+ return loglevel;
+}
+
+
+/**
+ * Set the cluster log level.
+ *
+ * Any non-negative value is accepted, including 0 (LOG_EMERG), which
+ * restricts logging to emergency messages only.  This matches the
+ * contract documented in clulog.h: only a negative severity is an error.
+ *
+ * @param severity	New log level.
+ * @return		Old log level, or -1 if 'severity' is negative (the
+ *			log level is left unchanged in that case).
+ */
+int
+clu_set_loglevel(int severity)
+{
+	int ret = loglevel;
+
+	if (severity < 0)
+		return -1;
+
+	loglevel = severity;
+	return ret;
+}
+
+
+/**
+ * Look up the name of the syslog facility currently in use.
+ *
+ * @return	The facilitynames[] entry whose value matches the current
+ *		facility, or "local4" when no entry matches.
+ */
+char *
+clu_get_facility(void)
+{
+	char *name = "local4";	/* fallback when nothing matches */
+	int i;
+
+	pthread_mutex_lock(&log_mutex);
+	for (i = 0; facilitynames[i].c_name != NULL; i++) {
+		if (facilitynames[i].c_val == syslog_facility) {
+			name = facilitynames[i].c_name;
+			break;
+		}
+	}
+	pthread_mutex_unlock(&log_mutex);
+
+	return name;
+}
+
+
+/**
+ * Select the syslog facility by name.
+ *
+ * An unrecognised name leaves the facility unchanged.  When the facility
+ * actually changes, the log is closed so that the next message reopens it
+ * with the new facility.
+ *
+ * @param facilityname	New log facility (see /usr/include/sys/syslog.h).
+ * @return		0
+ */
+int
+clu_set_facility(char *facilityname)
+{
+	int i, previous;
+
+	pthread_mutex_lock(&log_mutex);
+	previous = syslog_facility;
+
+	for (i = 0; facilitynames[i].c_name != NULL; i++) {
+		if (strcmp(facilityname, facilitynames[i].c_name) == 0) {
+			syslog_facility = facilitynames[i].c_val;
+			break;
+		}
+	}
+
+	if (syslog_facility != previous) {
+		closelog();
+		log_is_open = 0;
+	}
+
+	pthread_mutex_unlock(&log_mutex);
+	return 0;
+}
+
+
+/**
+ * Switch console logging on or off.  Does not work for daemons.
+ *
+ * @param onoff		0 switches console output off; any other value
+ *			switches it on.
+ * @return		The log-to-console state in effect before the call.
+ */
+int
+clu_log_console(int onoff)
+{
+	const int previous = useconsole;
+
+	/* Normalise to exactly 0 or 1. */
+	useconsole = (onoff != 0);
+
+	return previous;
+}
+
+
+/**
+ * Cluster logging function.  Talks to syslog and writes to the
+ * console, if necessary.
+ *
+ * @param severity	syslog severity (LOG_EMERG..LOG_DEBUG).  Messages
+ *			numerically above the current log level are dropped.
+ * @param write_to_cons	Nonzero to also print the message to stdout.
+ * @param pid		0 to log as the calling thread, otherwise the pid
+ *			to embed in the message text.
+ * @param prog		New program name (syslog ident), or NULL to keep
+ *			the current one.
+ * @param fmt		printf-style format, followed by its arguments.
+ * @return		0 on success (including a filtered message), -1 if
+ *			'severity' is not a valid syslog severity.
+ */
+int
+do_clulog(int severity,
+	  int write_to_cons,
+	  pid_t pid,
+	  char *prog,
+	  const char *fmt, ...)
+{
+	va_list args;
+	char logmsg[MAX_LOGMSG_LEN];	/* message to go to the log */
+	char printmsg[MAX_LOGMSG_LEN];	/* message to go to stdout */
+	int syslog_flags = LOG_NDELAY;
+	const int nlevels = (int)(sizeof(logger_prioritynames) /
+				  sizeof(logger_prioritynames[0]));
+
+	pthread_mutex_lock(&log_mutex);
+	if (severity > loglevel) {
+		pthread_mutex_unlock(&log_mutex);
+		return 0;
+	}
+
+	/*
+	 * The severity is used below as an index into
+	 * logger_prioritynames[]; refuse anything outside the table
+	 * instead of reading past it.
+	 */
+	if (severity < 0 || severity >= nlevels) {
+		pthread_mutex_unlock(&log_mutex);
+		return -1;
+	}
+
+	memset(logmsg, 0, MAX_LOGMSG_LEN);
+	memset(printmsg, 0, MAX_LOGMSG_LEN);
+
+	/*
+	 * Check to see if the caller has forked.
+	 */
+	if (!pid) {
+
+		/* Use thread IDs */
+		if (daemon_pid != gettid()) {
+
+			daemon_pid = gettid();
+			log_is_open = 0;
+		}
+
+		syslog_flags |= LOG_PID;
+
+	} else {
+
+		daemon_pid = pid;
+		closelog();
+		log_is_open = 0;
+		snprintf(logmsg, MAX_LOGMSG_LEN, "[%d]: ", (int)pid);
+	}
+
+	if (prog) {
+
+		if (daemon_name) {
+			/*
+			 * openlog() keeps the ident pointer rather than
+			 * copying the string, so drop the connection before
+			 * freeing the old name and reopen with the new one.
+			 */
+			closelog();
+			log_is_open = 0;
+			free(daemon_name);
+			daemon_name = NULL;
+		}
+
+		daemon_name = strdup(prog);
+	}
+
+	if (!log_is_open) {
+
+		openlog(daemon_name, syslog_flags, syslog_facility);
+		log_is_open = 1;
+	}
+	/*
+	 * Note: This can be called in the context of a CGI program, in which
+	 * case anything printed to stdout goes to the web page.  This can
+	 * cause problems if we have our standard <warning> strings b/c
+	 * the web client will try to interpret this as an html tag.
+	 */
+	snprintf(logmsg + strlen(logmsg), MAX_LOGMSG_LEN - strlen(logmsg),
+		 "<%s> ", logger_prioritynames[severity].c_name);
+
+	va_start(args, fmt);
+	vsnprintf(logmsg + strlen(logmsg), MAX_LOGMSG_LEN - strlen(logmsg),
+		  fmt, args);
+	va_end(args);
+
+	if (write_to_cons || useconsole) {
+		snprintf(printmsg, MAX_LOGMSG_LEN, "[%d] %s: ",
+			 (int)daemon_pid,
+			 logger_prioritynames[severity].c_name);
+
+		va_start(args, fmt);
+		vsnprintf(printmsg + strlen(printmsg),
+			  MAX_LOGMSG_LEN - strlen(printmsg), fmt, args);
+		va_end(args);
+
+		fprintf(stdout, "%s", printmsg);
+	}
+
+	/*
+	 * logmsg already holds the fully formatted text; pass it as data so
+	 * that any '%' it contains is not interpreted a second time.
+	 */
+	syslog(severity, "%s", logmsg);
+
+	pthread_mutex_unlock(&log_mutex);
+
+	return 0;
+}
+
+
+/**
+ * Stop the cluster logging facility.
+ *
+ * Also clears the "log is open" flag under log_mutex, so that the next
+ * do_clulog() call reopens the log with the configured ident and
+ * facility.
+ */
+void
+clulog_close(void)
+{
+	pthread_mutex_lock(&log_mutex);
+	closelog();
+	log_is_open = 0;
+	pthread_mutex_unlock(&log_mutex);
+}
/cvs/cluster/cluster/cman/qdisk/clulog.h,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/clulog.h
+++ - 2006-07-21 18:01:39.595061000 +0000
@@ -0,0 +1,161 @@
+/*
+ Copyright Red Hat, Inc. 2002
+ Copyright Mission Critical Linux, 2000
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+*/
+/** @file
+ * Header for clulog.c
+ */
+/*
+ * author: Jeff Moyer <moyer at missioncriticallinux.com>
+ */
+
+#ifndef __CLUSTER_LOG_H
+#define __CLUSTER_LOG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <syslog.h>
+#include <sys/types.h>
+
+#define LOGLEVEL_DFLT LOG_INFO
+#define MAX_LOGMSG_LEN 512
+
+/*
+ * int clu_set_loglevel(int severity)
+ *
+ * DESCRIPTION
+ * Set the logging level for this daemon. This is not a
+ * system-wide setting.
+ *
+ * ARGUMENTS
+ * severity Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ *
+ * RETURN VALUES
+ * On success, the previous loglevel is returned. On error -1 is returned.
+ *
+ * NOTES
+ * The only way of generating errors for this call is to give a negative
+ * value for severity. Currently, syslog lists severities up to 8, but
+ * I see no reason for this restriction if, in the future, we decided to
+ * add more levels. Thus, any number up to MAXINT will be supported.
+ */
+int clu_set_loglevel(int severity);
+/* Select the syslog facility by name (see sys/syslog.h); always returns 0. */
+int clu_set_facility(char *facility);
+/* Turn console echo of log messages off (0) or on (nonzero); returns the
+ * previous setting. */
+int clu_log_console(int onoff);
+
+/*
+ * int clu_get_loglevel(void)
+ *
+ * DESCRIPTION
+ * Get the current logging level.
+ *
+ * ARGUMENTS
+ * none
+ *
+ * RETURN VALUES
+ * The current logging level is returned.
+ */
+int clu_get_loglevel(void);
+
+/*
+ * DESCRIPTION
+ * Cluster logging facility. This is the actual function that does the
+ * logging. No one should call this, you should call the wrappers provided.
+ * i.e. clulog and clulog_and_print.
+ */
+int do_clulog(int severity, int write_to_cons, pid_t pid,
+ char *prog, const char *fmt, ...);
+/*
+ * int clulog(int severity, const char *fmt, ...)
+ *
+ * DESCRIPTION
+ * Cluster logging facility. This is a library routine which sends the
+ * supplied parameters to the syslog daemon. If the supplied severity is
+ * numerically larger than the current loglevel, the message is never sent
+ * to the log.
+ *
+ * ARGUMENTS
+ * severity Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ * fmt Format string as used with printf.
+ *
+ * RETURN VALUES
+ * On success, 0 is returned. On error, -1 is returned.
+ *
+ * NOTES
+ * Inability to contact the logging daemon is the only source of error
+ * for this function. Thus, it would behoove you to try a clulog before
+ * daemonizing your process. If it fails, print a message to stderr
+ * explaining that the cluster logging daemon should probably be started.
+ * If you really want your message to be heard by someone, use
+ * clulog_and_print().
+ */
+/* clulog: log without console output; do_clulog() is passed pid 0 and a
+ * NULL program name. */
+#define clulog(x,fmt,args...) do_clulog(x,0,0,NULL,fmt,##args)
+/* clulog_pid: as clulog, but passes an explicit pid and program name
+ * through to do_clulog(). */
+#define clulog_pid(x,pid,prog,fmt,args...) do_clulog(x,0,pid,prog,fmt,##args)
+
+/*
+ * int clulog_and_print(int severity, const char *fmt, ...)
+ *
+ * DESCRIPTION
+ * Cluster logging facility. This is a library routine which sends the
+ * supplied parameters to the syslog daemon. If the supplied severity is
+ * numerically larger than the current loglevel, the message is never sent
+ * to the log. This version also prints the given message to the terminal.
+ *
+ * ARGUMENTS
+ * severity Severity as documented in sys/syslog.h (i.e. LOG_ERR)
+ * fmt Format string as used with printf.
+ *
+ * RETURN VALUES
+ * On success, 0 is returned. On error, -1 is returned.
+ */
+#define clulog_and_print(x,fmt,args...) do_clulog(x,1,0,NULL,fmt,##args)
+
+
+/*
+ * void clulog_close(void)
+ *
+ * DESCRIPTION
+ * This is an optional call to close the logfile. This translates into a
+ * closelog() call.
+ *
+ * ARGUMENTS
+ * none
+ *
+ * RETURN VALUES
+ * This function does not return anything.
+ */
+void clulog_close(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __CLUSTER_LOG_H */
+/*
+ * Local variables:
+ * c-basic-offset: 8
+ * c-indent-level: 8
+ * tab-width: 8
+ * End:
+ */
/cvs/cluster/cluster/cman/qdisk/crc32.c,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/crc32.c
+++ - 2006-07-21 18:01:39.679326000 +0000
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2000 Bryan Call <bc at fodder.org>
+ *
+ * Modified by Lon H. Hohberger <lhh at redhat.com>
+ * Copyright (C) 2003 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/** @file
+ * Calculates CRC32s on data.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+static const unsigned long crctable[256] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+ 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+ 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+ 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+ 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+ 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+ 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+ 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+ 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+ 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+ 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+ 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+ 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+ 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+ 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+ 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+ 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+ 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
+};
+
+
+/**
+ * Calculate CRC32 of a data set.
+ *
+ * @param data	Data set for building CRC32
+ * @param count	Size of data set, in bytes.
+ * @return	CRC32 of data set.  0 is returned for a NULL or empty data
+ *		set, and whenever the running CRC ends at its initial value.
+ */
+uint32_t clu_crc32(const char *data, size_t count)
+{
+	size_t x;	/* same width as count, so the loop always terminates */
+	uint32_t crc = (uint32_t)~0;
+
+	if (data == NULL)
+		return 0;
+
+	for (x = 0; x < count; x++)
+		crc = (crc >> 8) ^
+		      (uint32_t)crctable[(crc ^ (unsigned char)data[x]) & 0xff];
+
+	if (crc == (uint32_t)~0)
+		return 0;
+	return ~crc;
+}
+
+#if 0
+#include <string.h>
+
+/* Disabled manual test driver: prints the CRC32 of the first argument. */
+int
+main(int argc, const char **argv)
+{
+	if (argc < 2) {
+		fprintf(stderr, "usage: %s <string>\n", argv[0]);
+		return 1;
+	}
+
+	printf("%08x\n", clu_crc32(argv[1], strlen(argv[1])));
+	return 0;
+}
+#endif
/cvs/cluster/cluster/cman/qdisk/disk.c,v --> standard output
revision 1.4.2.1
--- cluster/cman/qdisk/disk.c
+++ - 2006-07-21 18:01:39.760295000 +0000
@@ -0,0 +1,758 @@
+/*
+ Copyright Red Hat, Inc. 2002-2003, 2006
+ Copyright Mission Critical Linux, 2000
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+*/
+/** @file
+ * Single-block Raw/Direct I/O Functions
+ */
+/*
+ * author: Tim Burke <tburke at redhat.com>
+ * description: Raw IO Interfaces.
+ *
+ * The RAW IO code we are using from 2.2.13 requires user buffers and
+ * disk offsets to be 512 byte aligned. So this code consists of a
+ * read and write routine which check to see if the user buffer is
+ * aligned. If it isn't a temporary aligned buffer is allocated, a data
+ * copy is performed along with the IO operation itself.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <time.h>
+
+static int diskRawRead(int fd, char *buf, int len);
+uint32_t clu_crc32(const char *data, size_t count);
+
+
+/**
+ * Swap the bytes of a shared header so that it's always in big-endian form
+ * when stored on disk.
+ *
+ * Converts h_magic, h_hcrc, h_dcrc, h_length, h_view and h_timestamp in
+ * place.  be_swap32/swab32/swab64 are not defined in this file
+ * (presumably platform.h -- TODO confirm their exact semantics).
+ *
+ * @param hdr Header to encode.
+ */
+static void
+header_encode(shared_header_t *hdr)
+{
+ /* sanity check - LE machine -> already encoded. */
+ /* If the magic already equals its be_swap32() form, nothing is swapped. */
+ if (hdr->h_magic == be_swap32(SHARED_HEADER_MAGIC))
+ return;
+
+ swab32(hdr->h_magic);
+ swab32(hdr->h_hcrc);
+ swab32(hdr->h_dcrc);
+ swab32(hdr->h_length);
+ swab64(hdr->h_view);
+ swab64(hdr->h_timestamp);
+}
+
+
+/**
+ * Swap the bytes of a shared header so that it's always in host-byte order
+ * after we read it. This should be a macro calling header_encode.
+ *
+ * Applies the same field-by-field swaps as header_encode(); only the
+ * early-exit test differs (it compares against the unswapped magic).
+ *
+ * @param hdr Header to decode.
+ */
+static void
+header_decode(shared_header_t *hdr)
+{
+ /* sanity check - LE machine -> already decoded. */
+ /* If the magic is already readable as-is, nothing is swapped. */
+ if (hdr->h_magic == SHARED_HEADER_MAGIC)
+ return;
+
+ swab32(hdr->h_magic);
+ swab32(hdr->h_hcrc);
+ swab32(hdr->h_dcrc);
+ swab32(hdr->h_length);
+ swab64(hdr->h_view);
+ swab64(hdr->h_timestamp);
+}
+
+
+/**
+ * Generate a shared header suitable for storing data. This includes:
+ * header magic, header crc, data crc, data length, timestamp.
+ * The header CRC is generated *after* the data CRC; so the header,
+ * in effect, ensures that the data CRC is valid before we even look
+ * at the data. Thus, if the header CRC decodes properly, then we
+ * assume that there's a very very high chance that the data CRC is valid.
+ * If the data CRC doesn't match the data, it's indicative of a problem.
+ *
+ * The header is passed through header_encode() before returning.
+ *
+ * @param hdr Preallocated pointer to shared_header_t structure.
+ * @param data Data to be stored with hdr (may be NULL for no data).
+ * @param count Size of data.
+ * @return -1 if CRC32 generation fails, or 0 on success.
+ */
+static int
+header_generate(shared_header_t *hdr, const char *data, size_t count)
+{
+ /* Start from all zeroes so that h_hcrc is 0 while the header CRC is
+  * computed below; header_verify() recomputes it the same way. */
+ memset(hdr,0,sizeof(*hdr));
+
+ hdr->h_magic = SHARED_HEADER_MAGIC;
+
+ if (data && count) {
+ hdr->h_dcrc = clu_crc32(data, count);
+ hdr->h_length = (uint32_t)count;
+
+ /* clu_crc32() uses 0 as its "no valid CRC" result. */
+ if (hdr->h_dcrc == 0) {
+ fprintf(stderr, "Invalid CRC32 generated on data!\n");
+ return -1;
+ }
+ }
+
+ hdr->h_timestamp = (uint64_t)time(NULL);
+
+ /* Must come last: the header CRC covers every field set above. */
+ hdr->h_hcrc = clu_crc32((char *)hdr, sizeof(*hdr));
+ if (hdr->h_hcrc == 0) {
+ fprintf(stderr, "Invalid CRC32 generated on header!\n");
+ return -1;
+ }
+
+ header_encode(hdr);
+
+ return 0;
+}
+
+
+/**
+ * Verify the integrity of a shared header. Basically, check the CRC32
+ * information against the data and header. A better name for this would
+ * be "shared_block_verify".
+ *
+ * The header is decoded in place (header_decode) as a side effect.
+ *
+ * @param hdr Header read from disk; decoded in place.
+ * @param data Data that was stored with hdr (may be NULL).
+ * @param count Number of bytes available at data.
+ * @return -1 if the header CRC, the magic number or the data CRC does
+ * not match; 0 otherwise.  Note that 0 is also returned,
+ * without checking the data CRC, when data is NULL, count is
+ * 0, or count is smaller than the length recorded in hdr.
+ */
+static int
+header_verify(shared_header_t *hdr, const char *data, size_t count)
+{
+ uint32_t crc;
+ uint32_t bkupcrc;
+
+ header_decode(hdr);
+ /*
+ * verify the header's CRC32. Ok, we know it's overkill taking
+ * the CRC32 of a friggin' 16-byte (12 bytes, really) structure,
+ * but why not?
+ *
+ * The stored CRC was computed with h_hcrc set to 0, so zero the
+ * field while recomputing and put the stored value back afterwards.
+ */
+ bkupcrc = hdr->h_hcrc;
+ hdr->h_hcrc = 0;
+ crc = clu_crc32((char *)hdr, sizeof(*hdr));
+ hdr->h_hcrc = bkupcrc;
+ if (bkupcrc != crc) {
+#if 0
+ fprintf(stderr, "Header CRC32 mismatch; Exp: 0x%08x "
+ "Got: 0x%08x\n", bkupcrc, crc);
+#endif
+ return -1;
+ }
+
+ /*
+ * Verify the magic number.
+ */
+ if (hdr->h_magic != SHARED_HEADER_MAGIC) {
+#if 0
+ fprintf(stderr, "Magic mismatch; Exp: 0x%08x "
+ "Got: 0x%08x\n", SHARED_HEADER_MAGIC, hdr->h_magic);
+#endif
+ return -1;
+ }
+
+ /*
+ * If there's no data or no count, or perhaps the length fed in is less
+ * then the expected length, bail.
+ */
+ if (!data || !count || (count < hdr->h_length))
+ return 0;
+
+ /* count >= h_length here, so the CRC covers exactly h_length bytes. */
+ crc = clu_crc32(data, (count > hdr->h_length) ?
+ hdr->h_length : count);
+
+ if (hdr->h_dcrc != crc) {
+#if 0
+ fprintf(stderr, "Data CRC32 mismatch; Exp: 0x%08x "
+ "Got: 0x%08x\n", hdr->h_dcrc, crc);
+#endif
+ return -1;
+ }
+
+ return 0;
+}
+
+
+
+/*
+ * qdisk_open
+ * Called to open the shared state partition with appropriate mode
+ * (read/write, synchronous, direct I/O) and to mark the descriptor
+ * close-on-exec.
+ * Returns - (the file descriptor), a value >= 0 on success; a negative
+ * value with errno set on failure.  The descriptor is closed again on
+ * every failure path.
+ */
+int
+qdisk_open(char *name)
+{
+	int fd;
+	int flags;
+	int saved_errno;
+	off_t pos;	/* off_t, so large offsets are not truncated */
+
+	/*
+	 * Open for synchronous writes to insure all writes go directly
+	 * to disk.
+	 */
+	fd = open(name, O_RDWR | O_SYNC | O_DIRECT);
+	if (fd < 0) {
+		return fd;
+	}
+
+	/* Check to verify that the partition is large enough.*/
+	pos = lseek(fd, END_OF_DISK, SEEK_SET);
+
+	if (pos < 0) {
+		saved_errno = errno;
+		perror("open_partition: seek");
+		close(fd);
+		errno = saved_errno;
+		return -1;
+	}
+
+	if (pos < END_OF_DISK) {
+		fprintf(stderr, "Partition %s too small\n", name);
+		close(fd);
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Set close-on-exec bit */
+	flags = fcntl(fd, F_GETFD, 0);
+	if (flags < 0) {
+		saved_errno = errno;
+		close(fd);
+		errno = saved_errno;
+		return -1;
+	}
+
+	flags |= FD_CLOEXEC;
+	if (fcntl(fd, F_SETFD, flags) < 0) {
+		saved_errno = errno;
+		perror("open_partition: fcntl");
+		close(fd);
+		errno = saved_errno;
+		return -1;
+	}
+
+	return fd;
+}
+
+
+/*
+ * qdisk_close
+ * Closes the shared state disk partition and resets the caller's
+ * descriptor to -1.
+ * Returns - value from the close syscall, or -1 with errno set to EINVAL
+ * when fd is NULL or does not hold an open descriptor.
+ */
+int
+qdisk_close(int *fd)
+{
+	if (fd != NULL && *fd >= 0) {
+		const int rc = close(*fd);
+
+		/* Invalidate the handle whatever close() reported. */
+		*fd = -1;
+		return rc;
+	}
+
+	errno = EINVAL;
+	return -1;
+}
+
+/*
+ * qdisk_validate
+ * Called to verify that the specified device special file representing
+ * the partition appears to be a valid device: it must exist and must be
+ * openable through qdisk_open().  A path that is neither a block nor a
+ * character device only produces a warning.
+ * Returns: 0 - success, -1 - failure
+ */
+int
+qdisk_validate(char *name)
+{
+	struct stat sb;
+	int fd;
+
+	if (stat(name, &sb) < 0) {
+		perror("stat");
+		return -1;
+	}
+
+	/* Not a device node: warn, but carry on. */
+	if (!S_ISCHR(sb.st_mode) && !S_ISBLK(sb.st_mode))
+		fprintf(stderr, "Warning: %s is not a block device\n",
+			name);
+
+	/* Verify read/write permission by actually opening it. */
+	fd = qdisk_open(name);
+	if (fd >= 0) {
+		qdisk_close(&fd);
+		return 0;
+	}
+
+	fprintf(stderr, "%s: open of %s for RDWR failed: %s\n",
+		__FUNCTION__, name, strerror(errno));
+	return -1;
+}
+
+
+/*
+ * Read one shared block (header followed by data) from the given offset
+ * into buf and verify it.
+ *
+ * fd		- descriptor returned by qdisk_open().
+ * readOffset	- byte offset of the block on disk.
+ * buf		- destination; receives the header followed by the data.
+ * len		- size of buf in bytes; must cover at least the header.
+ *
+ * Returns 0 on success.  Returns -1 with errno set to EINVAL (len too
+ * small), ENODATA (seek or short read) or EPROTO (verification failed,
+ * or the recorded data length does not fit in buf).
+ */
+static int
+diskRawReadShadow(int fd, off_t readOffset, char *buf, int len)
+{
+	off_t pos;	/* off_t, so large offsets are not truncated */
+	int ret;
+	shared_header_t *hdrp;
+	char *data;
+	size_t datalen;
+
+	if (len < 0 || (size_t)len < sizeof(*hdrp)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	pos = lseek(fd, readOffset, SEEK_SET);
+	if (pos != readOffset) {
+		errno = ENODATA;
+		return -1;
+	}
+
+	ret = diskRawRead(fd, buf, len);
+	if (ret != len) {
+		errno = ENODATA;
+		return -1;
+	}
+
+	/* Decode the header portion so we can run a checksum on it. */
+	hdrp = (shared_header_t *)buf;
+	data = buf + sizeof(*hdrp);
+	/* Only the bytes after the header are data. */
+	datalen = (size_t)len - sizeof(*hdrp);
+	swab_shared_header_t(hdrp);
+
+	if (header_verify(hdrp, data, datalen)) {
+		errno = EPROTO;
+		return -1;
+	}
+
+	/*
+	 * header_verify() skips the data CRC when the recorded length
+	 * exceeds what it was given; a block claiming more data than was
+	 * read cannot be valid, so reject it here.
+	 */
+	if (hdrp->h_length > datalen) {
+		errno = EPROTO;
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * The RAW IO implementation requires buffers to be 512 byte aligned.
+ * Here we check for alignment and do a bounceio if necessary.
+ *
+ * A buffer is read into directly only when its address has the low ten
+ * bits clear (mask 0x3ff) and len is a multiple of 512; anything else
+ * goes through a temporary 512-byte aligned buffer, which limits such
+ * reads to 512 bytes.
+ *
+ * Returns the number of bytes placed in buf (at most len), or -1 on
+ * error with errno set.
+ */
+static int
+diskRawRead(int fd, char *buf, int len)
+{
+	char *alignedBuf = NULL;
+	int readret;
+	int readlen;
+	int err;
+
+	if (len < 0) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) &&
+	    ((len % 512) == 0)) {
+		/* Already aligned and even multiple of 512, no bounceio
+		 * required. */
+		return (read(fd, buf, len));
+	}
+
+	if (len > 512) {
+		fprintf(stderr,
+			"diskRawRead: not setup for reads larger than %d.\n",
+			512);
+		return (-1);
+	}
+
+	/*
+	 * All IOs must be of size which is a multiple of 512.  Round the
+	 * request up to the next multiple.
+	 * XXX - if the on-disk offsets don't provide enough room we're cooked!
+	 */
+	readlen = ((len + 511) / 512) * 512;
+
+	/* posix_memalign() reports failure through a nonzero return value
+	 * and does not set errno. */
+	err = posix_memalign((void **)&alignedBuf, 512, 512);
+	if (err != 0) {
+		errno = err;
+		return (-1);
+	}
+
+	readret = read(fd, alignedBuf, readlen);
+	if (readret > 0) {
+		/* Hand back no more than the caller asked for. */
+		if (readret > len)
+			readret = len;
+		memcpy(buf, alignedBuf, readret);
+	}
+
+	free(alignedBuf);
+	if (readret != len) {
+		fprintf(stderr, "diskRawRead: read err, len=%d, readret=%d\n",
+			len, readret);
+	}
+
+	return (readret);
+}
+
+
+/*
+ * The RAW IO implementation requires buffers to be 512 byte aligned.
+ * Here we check for alignment and do a bounceio if necessary.
+ *
+ * A buffer is written directly only when its address has the low ten
+ * bits clear (mask 0x3ff) and len is a multiple of 512; anything else is
+ * copied into a temporary 512-byte aligned buffer, zero padded to a full
+ * 512 bytes, which limits such writes to 512 bytes.
+ *
+ * Returns the number of caller bytes written (at most len), or -1 on
+ * error with errno set.
+ */
+static int
+diskRawWrite(int fd, char *buf, int len)
+{
+	char *alignedBuf = NULL;
+	int ret;
+	int writelen;
+	int err;
+
+	if (len < 0) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	if ((((unsigned long) buf & (unsigned long) 0x3ff) == 0) &&
+	    ((len % 512) == 0)) {
+		/* Already aligned and even multiple of 512, no bounceio
+		 * required. */
+		return (write(fd, buf, len));
+	}
+
+	if (len > 512) {
+		fprintf(stderr,
+			"diskRawWrite: not setup for larger than %d.\n",
+			512);
+		return (-1);
+	}
+
+	/*
+	 * All IOs must be of size which is a multiple of 512.  Round the
+	 * request up to the next multiple.
+	 * XXX - if the on-disk offsets don't provide enough room we're cooked!
+	 */
+	writelen = ((len + 511) / 512) * 512;
+
+	/* posix_memalign() reports failure through a nonzero return value
+	 * and does not set errno. */
+	err = posix_memalign((void **)&alignedBuf, 512, 512);
+	if (err != 0) {
+		errno = err;
+		return (-1);
+	}
+
+	memcpy(alignedBuf, buf, len);
+	/* Zero the padding so no stale heap contents reach the disk. */
+	memset(alignedBuf + len, 0, 512 - len);
+
+	ret = write(fd, alignedBuf, writelen);
+	err = errno;	/* callers report errno after a failed write */
+	if (ret > len) {
+		ret = len;
+	}
+
+	free(alignedBuf);
+	if (ret != len) {
+		fprintf(stderr, "diskRawWrite: write err, len=%d, ret=%d\n",
+			len, ret);
+	}
+
+	errno = err;
+	return (ret);
+}
+
+
+/*
+ * Seek to writeOffset and write len bytes from buf through
+ * diskRawWrite().
+ *
+ * Returns 0 when exactly len bytes were written; -1 on a negative
+ * offset or length, a failed seek, or a short or failed write.  Every
+ * failure is reported on stderr.
+ */
+static int
+diskRawWriteShadow(int fd, __off64_t writeOffset, char *buf, int len)
+{
+	off_t pos;
+	ssize_t written;
+
+	if (writeOffset < 0 || len < 0) {
+		fprintf(stderr,
+			"diskRawWriteShadow: writeOffset=%08x, "
+			"len=%08x.\n", (int)writeOffset, len);
+		return (-1);
+	}
+
+	pos = lseek(fd, writeOffset, SEEK_SET);
+	if (pos != writeOffset) {
+		fprintf(stderr,
+			"diskRawWriteShadow: can't seek to offset %d\n",
+			(int) writeOffset);
+		return (-1);
+	}
+
+	written = diskRawWrite(fd, buf, len);
+	if (written == len)
+		return 0;
+
+	/* A hard error gets the errno text as well as the summary line. */
+	if (written == -1)
+		fprintf(stderr, "%s: %s\n", __FUNCTION__,
+			strerror(errno));
+
+	fprintf(stderr,
+		"diskRawWriteShadow: aligned write returned %d"
+		", not %d\n", (int)written, (int)len);
+	return (-1);
+}
+
+
+/*
+ * qdisk_read
+ * Read one shared block from 'offset', verify it and copy its data
+ * portion into buf.
+ *
+ * fd		- descriptor returned by qdisk_open().
+ * offset	- byte offset of the block on disk.
+ * buf		- destination for up to count bytes of data.
+ * count	- size of buf in bytes.
+ *
+ * At most count bytes are copied; if the block holds fewer, the rest of
+ * buf is zero filled.
+ * Returns count on success, -1 on error with errno set.
+ */
+int
+qdisk_read(int fd, __off64_t offset, void *buf, int count)
+{
+	shared_header_t *hdrp = NULL;
+	char *data;
+	size_t total;
+	size_t copylen;
+	int rv;
+	int saved_errno;
+
+	if (buf == NULL || count < 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * Calculate the total length of the buffer, including the header.
+	 * Raw blocks are 512 byte aligned.
+	 */
+	total = (size_t)count + sizeof(shared_header_t);
+	if (total < 512)
+		total = 512;
+
+	/* Round it up */
+	if (total % 512)
+		total += 512 - (total % 512);
+
+	/* posix_memalign() reports failure through a nonzero return value
+	 * and does not set errno. */
+	rv = posix_memalign((void **)&hdrp, sysconf(_SC_PAGESIZE), total);
+	if (rv != 0 || hdrp == NULL) {
+		errno = rv ? rv : ENOMEM;
+		return -1;
+	}
+
+	data = (char *)hdrp + sizeof(shared_header_t);
+
+	rv = diskRawReadShadow(fd, offset, (char *)hdrp, total);
+	if (rv == -1) {
+		saved_errno = errno;
+		free(hdrp);
+		errno = saved_errno;
+		return -1;
+	}
+
+	/* Copy out the data, never more than the caller's buffer holds. */
+	copylen = hdrp->h_length;
+	if (copylen > (size_t)count)
+		copylen = (size_t)count;
+	memcpy(buf, data, copylen);
+
+	/* Zero out the remainder. */
+	if (copylen < (size_t)count)
+		memset((char *)buf + copylen, 0, (size_t)count - copylen);
+
+	free(hdrp);
+	return count;
+}
+
+
+/*
+ * qdisk_write
+ * Wrap count bytes from buf in a shared header and write the resulting
+ * 512-byte block at 'offset'.
+ *
+ * fd		- descriptor returned by qdisk_open().
+ * offset	- byte offset of the block on disk.
+ * buf		- data to store (may be NULL only when count is 0).
+ * count	- number of data bytes; header plus data must fit in one
+ *		  512-byte block.
+ *
+ * Returns count on success.  Returns -1 on error with errno set
+ * (ENOSPC when the data does not fit in a single block).
+ */
+int
+qdisk_write(int fd, __off64_t offset, const void *buf, int count)
+{
+	const size_t psz = 512;	/* on-disk block size */
+	const size_t maxsize = psz - sizeof(shared_header_t);
+	shared_header_t *hdrp = NULL;
+	char *data;
+	int rv;
+	int saved_errno;
+
+	/*
+	 * Only a single block is ever written, so anything that does not
+	 * fit next to the header must be refused rather than silently
+	 * dropped.
+	 */
+	if (count < 0 || (size_t)count > maxsize) {
+		printf("error: count %d > %d\n", (int)count, (int)maxsize);
+		errno = ENOSPC;
+		return -1;
+	}
+
+	if (buf == NULL && count > 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* posix_memalign() reports failure through a nonzero return value
+	 * and does not set errno. */
+	rv = posix_memalign((void **)&hdrp, sysconf(_SC_PAGESIZE), psz);
+	if (rv != 0 || hdrp == NULL) {
+		errno = rv ? rv : ENOMEM;
+		perror("posix_memalign");
+		return -1;
+	}
+
+	/* Zero the whole block so the padding written to disk is defined. */
+	memset(hdrp, 0, psz);
+
+	/*
+	 * Copy the data into our new buffer
+	 */
+	data = (char *)hdrp + sizeof(shared_header_t);
+	if (count > 0)
+		memcpy(data, buf, count);
+
+	if (header_generate(hdrp, buf, count) == -1) {
+		free((char *)hdrp);
+		return -1;
+	}
+	swab_shared_header_t(hdrp);
+
+	/*
+	 * Locking must be performed elsewhere.  We make no assumptions
+	 * about locking here.
+	 */
+	rv = diskRawWriteShadow(fd, offset, (char *)hdrp, psz);
+	saved_errno = errno;
+	if (rv == -1)
+		perror("diskRawWriteShadow");
+
+	free((char *)hdrp);
+	if (rv == -1) {
+		errno = saved_errno;
+		return -1;
+	}
+	return count;
+}
+
+
+/*
+ * Write a fresh quorum header (magic, label, updating host, timestamp)
+ * at OFFSET_HEADER.  If a header is already present, a warning is
+ * printed and its contents are used as the starting point.
+ *
+ * fd		- descriptor returned by qdisk_open().
+ * label	- cluster/label name to record.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+header_init(int fd, char *label)
+{
+	quorum_header_t qh;
+	time_t now;
+
+	if (label == NULL) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Start from a defined state in case nothing can be read back. */
+	memset(&qh, 0, sizeof(qh));
+
+	if (qdisk_read(fd, OFFSET_HEADER, &qh, sizeof(qh)) == sizeof(qh)) {
+		swab_quorum_header_t(&qh);
+		if (qh.qh_magic == HEADER_MAGIC_OLD) {
+			printf("Warning: Red Hat Cluster Manager 1.2.x "
+			       "header found\n");
+		} else if (qh.qh_magic == HEADER_MAGIC_NUMBER) {
+			printf("Warning: Initializing previously "
+			       "initialized partition\n");
+		}
+	}
+
+	if (gethostname(qh.qh_updatehost, sizeof(qh.qh_updatehost)) < 0) {
+		perror("gethostname");
+		return -1;
+	}
+	/* gethostname() need not terminate a truncated name. */
+	qh.qh_updatehost[sizeof(qh.qh_updatehost) - 1] = '\0';
+
+	/* Copy in the cluster/label name; the label is data, not a format. */
+	snprintf(qh.qh_cluster, sizeof(qh.qh_cluster)-1, "%s", label);
+
+	/* time() signals failure with (time_t)-1. */
+	now = time(NULL);
+	if (now == (time_t)-1) {
+		perror("time");
+		return -1;
+	}
+	qh.qh_timestamp = (uint64_t)now;
+
+	qh.qh_magic = HEADER_MAGIC_NUMBER;
+	swab_quorum_header_t(&qh);
+	if (qdisk_write(fd, OFFSET_HEADER, &qh, sizeof(qh)) != sizeof(qh)) {
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * qdisk_init
+ * Initialise a quorum partition: write the quorum header, then an empty
+ * status block for every node ID from 1 to MAX_NODES_DISK.
+ *
+ * partname	- path of the partition to initialise.
+ * label	- cluster/label name stored in the header.
+ *
+ * Returns 0 on success, -1 on failure.  The partition is closed again
+ * on every path.
+ */
+int
+qdisk_init(char *partname, char *label)
+{
+	int fd;
+	status_block_t ps, wps;
+	int nid;
+	time_t t;
+
+	if (qdisk_validate(partname) < 0) {
+		perror("qdisk_verify");
+		return -1;
+	}
+
+	fd = qdisk_open(partname);
+	if (fd < 0) {
+		perror("qdisk_open");
+		return -1;
+	}
+
+	if (header_init(fd, label) < 0) {
+		qdisk_close(&fd);
+		return -1;
+	}
+
+	time(&t);
+
+	/*
+	 * Zero the whole template first so that every field and all
+	 * padding written to disk is defined; only the non-zero fields
+	 * need setting afterwards.
+	 */
+	memset(&ps, 0, sizeof(ps));
+	ps.ps_magic = STATE_MAGIC_NUMBER;
+	ps.ps_timestamp = (uint64_t)t;
+	ps.ps_state = (uint8_t)S_NONE;
+
+	/* Node IDs 1..N */
+	for (nid = 1; nid <= MAX_NODES_DISK; nid++) {
+		ps.ps_nodeid = nid;
+
+		printf("Initializing status block for node %d...\n", nid);
+		/* Byte-swap a copy so the template stays in host order. */
+		wps = ps;
+		swab_status_block_t(&wps);
+
+		if (qdisk_write(fd, qdisk_nodeid_offset(nid), &wps,
+				sizeof(wps)) < 0) {
+			printf("Error writing node ID block %d\n", nid);
+			qdisk_close(&fd);
+			return -1;
+		}
+	}
+
+	qdisk_close(&fd);
+
+	return 0;
+}
+
/cvs/cluster/cluster/cman/qdisk/disk.h,v --> standard output
revision 1.3.2.1
--- cluster/cman/qdisk/disk.h
+++ - 2006-07-21 18:01:39.865378000 +0000
@@ -0,0 +1,269 @@
+/**
+ Copyright Red Hat, Inc. 2006
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+
+ Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+ @file Main quorum daemon include file
+ */
+#ifndef _QUORUM_DISK_H
+#define _QUORUM_DISK_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include <arpa/inet.h>
+#include <libcman.h>
+
+/* Number of node status blocks kept on the quorum disk (node IDs 1..N) */
+#define MAX_NODES_DISK 16
+/* Bytes needed for one bit per node, rounded up to a whole byte */
+#define MEMB_MASK_LEN ((MAX_NODES_DISK / 8) + \
+ (!!(MAX_NODES_DISK % 8)))
+/* MEMB_MASK_LEN rounded up to the next multiple of 8 bytes */
+#define DISK_MEMB_MASK_LEN ((MEMB_MASK_LEN + 7) & ~7)
+
+/** The membership bitmask type */
+typedef uint8_t memb_mask_t [DISK_MEMB_MASK_LEN];
+
+/*
+ * Node state as stored in status_block_t.ps_state.
+ * The numeric order is significant: callers compare states with
+ * relational operators (e.g. ">= S_INIT", ">= S_RUN", "<= S_EVICT").
+ */
+typedef enum {
+ S_NONE = 0x0, // Shutdown / not quorate / not running
+ S_EVICT = 0x1, // Voted out / about to be fenced.
+ /* ^^^ Fencing OK */
+ S_INIT = 0x2, // Initializing. Hold your fire.
+ /* vvv Fencing will kill a node */
+ S_RUN = 0x5, // I think I'm running.
+ S_MASTER= 0x6 // I know I'm running, and have advertised to
+ // CMAN the availability of the disk vote for my
+ // partition.
+} disk_node_state_t;
+
+
+/*
+ * Message IDs exchanged through the ps_msg field of a status block as
+ * part of the master bid/vote mechanism.
+ */
+typedef enum {
+ M_NONE = 0x0,
+ M_BID = 0x1,
+ M_ACK = 0x2,
+ M_NACK = 0x3,
+ M_MASK = 0x4
+} disk_msg_id_t;
+
+
+/*
+ * Bit flags, presumably intended for status_block_t.ps_flags.
+ * NOTE(review): the code visible with this header always writes
+ * ps_flags as 0 -- confirm whether these are used anywhere.
+ */
+typedef enum {
+ FL_MSG = 0x1,
+ FL_BID = 0x2,
+ FL_VOTE = 0x4
+} disk_state_flag_t;
+
+
+/* RHEL 2.1 / RHCS3 old magic numbers */
+#define HEADER_MAGIC_OLD 0x39119FCD /* partition header */
+#define STATE_MAGIC_OLD 0xF1840DCE /* Status block */
+#define SHARED_HEADER_MAGIC_OLD 0x00DEBB1E /* Per-block header */
+
+/* Current magic numbers (written by this version) */
+#define HEADER_MAGIC_NUMBER 0xeb7a62c2 /* Partition header */
+#define STATE_MAGIC_NUMBER 0x47bacef8 /* Status block */
+#define SHARED_HEADER_MAGIC 0x00DEBB1E /* Per-block header */
+
+
+/*
+ * Per-node status block as written to the quorum disk (one per node ID).
+ * The structure is packed; the bare numeric comments give the running
+ * byte offset after the preceding field(s), with memb_mask_t being
+ * 8 bytes for MAX_NODES_DISK == 16.
+ */
+typedef struct __attribute__ ((packed)) {
+ uint32_t ps_magic;
+ /* 4 */
+ uint32_t ps_updatenode; // Last writer
+ /* 8 */
+ uint64_t ps_timestamp; // time of last update
+ /* 16 */
+ uint32_t ps_nodeid;
+ uint32_t pad0;
+ /* 24 */
+ uint8_t ps_state; // running or stopped
+ uint8_t pad1[1];
+ uint16_t ps_flags;
+ /* 28 */
+ uint16_t ps_score; // Local points
+ uint16_t ps_scoremax; // What we think is our max
+ // points, if other nodes
+ // disagree, we may be voted
+ // out
+ /* 32 */
+ uint32_t ps_ca_sec; // Cycle speed (average)
+ uint32_t ps_ca_usec;
+ /* 40 */
+ uint32_t ps_lc_sec; // Cycle speed (last)
+ uint32_t ps_lc_usec;
+ /* 48 */
+ uint64_t ps_incarnation; // Token to detect hung +
+ // restored node
+ /* 56 */
+ uint16_t ps_msg; // Vote/bid mechanism
+ uint16_t ps_seq;
+ uint32_t ps_arg;
+ /* 64 */
+ memb_mask_t ps_mask; // Bitmap
+ memb_mask_t ps_master_mask; // Bitmap
+ /* 80 */
+} status_block_t;
+
+/*
+ * Apply the swab16/swab32/swab64 macros (defined outside this header) to
+ * the multi-byte fields of a status_block_t, in place.  Used both before
+ * writing a block and after reading one.  Single-byte fields and the
+ * byte-array masks are left alone.
+ *
+ * NOTE(review): ps_incarnation (uint64_t) is not swapped here -- confirm
+ * whether that is intentional before changing it, since doing so alters
+ * the on-disk byte order of that field.
+ * NOTE(review): expands to a bare { } block, so it must be used as a
+ * full statement.
+ */
+#define swab_status_block_t(ptr) \
+{\
+ swab32((ptr)->ps_magic);\
+ swab32((ptr)->ps_updatenode);\
+ swab64((ptr)->ps_timestamp);\
+ swab32((ptr)->ps_nodeid);\
+ swab32((ptr)->pad0);\
+ /* state + pad */ \
+ swab16((ptr)->ps_flags);\
+ swab16((ptr)->ps_score);\
+ swab16((ptr)->ps_scoremax);\
+ /* Cycle speeds */ \
+ swab32((ptr)->ps_ca_sec);\
+ swab32((ptr)->ps_ca_usec);\
+ swab32((ptr)->ps_lc_sec);\
+ swab32((ptr)->ps_lc_usec);\
+ /* Message */ \
+ swab16((ptr)->ps_msg); \
+ swab16((ptr)->ps_seq); \
+ swab32((ptr)->ps_arg); \
+ }
+
+
+/*
+ * Shared state disk header. Describes cluster global information.
+ * Stored at OFFSET_HEADER; packed, 272 bytes in total
+ * (4 + 4 + 8 + 128 + 128).
+ */
+typedef struct __attribute__ ((packed)) {
+ uint32_t qh_magic;
+ uint32_t qh_align; // 64-bit-ism: alignment fixer.
+ uint64_t qh_timestamp; // time of last update
+ char qh_updatehost[128];// Hostname who put this here...
+ char qh_cluster[128]; // Cluster name
+} quorum_header_t;
+
+/*
+ * Apply swab32/swab64 to the integer fields of a quorum_header_t, in
+ * place.  The character arrays are not touched.  Expands to a bare { }
+ * block, so use it as a full statement.
+ */
+#define swab_quorum_header_t(ptr) \
+{\
+ swab32((ptr)->qh_magic); \
+ swab32((ptr)->qh_align); \
+ swab64((ptr)->qh_timestamp); \
+}
+
+
+
+/*
+ * The user data is stored with this header prepended.
+ * The header ONLY contains CRC information and the length of the data.
+ * The data blocks themselves contain their own respective magic numbers.
+ * Packed; 32 bytes in total (4 x uint32_t + 2 x uint64_t).
+ */
+typedef struct __attribute__ ((packed)) {
+ uint32_t h_magic; /* Header magic */
+ uint32_t h_hcrc; /* Header CRC */
+ uint32_t h_dcrc; /* CRC32 of data */
+ uint32_t h_length; /* Length of real data */
+ uint64_t h_view; /* View # of real data */
+ uint64_t h_timestamp; /* Timestamp */
+} shared_header_t;
+
+/*
+ * All-zero initializer for shared_header_t.  Note that the expansion
+ * already contains the '=' sign, so it is used as
+ *   shared_header_t hdr SHARED_HEADER_INITIALIZER;
+ */
+#define SHARED_HEADER_INITIALIZER = {0, 0, 0, 0, 0, 0}
+
+/*
+ * Apply swab32/swab64 to every field of a shared_header_t, in place.
+ * Expands to a bare { } block, so use it as a full statement.
+ */
+#define swab_shared_header_t(ptr) \
+{\
+ swab32((ptr)->h_magic);\
+ swab32((ptr)->h_hcrc);\
+ swab32((ptr)->h_dcrc);\
+ swab32((ptr)->h_length);\
+ swab64((ptr)->h_view);\
+ swab64((ptr)->h_timestamp);\
+}
+
+
+/*
+ * On-disk layout (offsets from RHCM 1.2.x):
+ *   [OFFSET_HEADER, +HEADER_SIZE)	partition header (quorum_header_t)
+ *   OFFSET_FIRST_STATUS_BLOCK ...	one SPACE_PER_STATUS_BLOCK-sized
+ *					slot per node ID
+ */
+#define OFFSET_HEADER	0
+#define HEADER_SIZE	4096		/* Page size for now */
+
+#define OFFSET_FIRST_STATUS_BLOCK	(OFFSET_HEADER + HEADER_SIZE)
+#define SPACE_PER_STATUS_BLOCK		4096 /* Page size for now */
+#define STATUS_BLOCK_COUNT		MAX_NODES_DISK
+
+#define SPACE_PER_MESSAGE_BLOCK		(4096)
+#define MESSAGE_BLOCK_COUNT		MAX_NODES_DISK
+
+/* First byte past the area used by the header and status blocks.
+   (No trailing line continuation: the definition ends here.) */
+#define END_OF_DISK	(OFFSET_FIRST_STATUS_BLOCK + \
+			 (MAX_NODES_DISK + 1) * \
+			 SPACE_PER_STATUS_BLOCK)
+
+
+
+/*
+ * From disk.c -- raw quorum-disk access.
+ * Callers in this code base treat a negative return value from any of
+ * these as failure.  qdisk_read/qdisk_write take an absolute byte offset
+ * and a length; header_init() expects them to return the number of bytes
+ * transferred on success.
+ */
+int qdisk_open(char *name);
+int qdisk_close(int *fd);
+int qdisk_init(char *name, char *clustername);
+int qdisk_validate(char *name);
+int qdisk_read(int fd, __off64_t ofs, void *buf, int len);
+int qdisk_write(int fd, __off64_t ofs, const void *buf, int len);
+
+/*
+ * Byte offset on the quorum disk of the status block belonging to a
+ * 1-based node ID.  The argument is parenthesized so that any expression
+ * (e.g. "a ? b : c" or "x + 1") expands correctly.
+ */
+#define qdisk_nodeid_offset(nodeid) \
+	(OFFSET_FIRST_STATUS_BLOCK + (SPACE_PER_STATUS_BLOCK * ((nodeid) - 1)))
+
+/* From disk_util.c */
+/* Number of recent write-time samples kept in qd_ctx.qc_last[] */
+#define HISTORY_LENGTH 60
+/*
+ * In-memory form of the bid/vote message carried in a status block
+ * (ps_msg / ps_arg / ps_seq).
+ */
+typedef struct {
+ disk_msg_id_t m_msg; /* this is an int, but will be stored as 16bit*/
+ uint32_t m_arg;
+ uint16_t m_seq;
+ uint16_t pad0;
+} disk_msg_t;
+
+/*
+ * Quorum daemon context (in-memory only).
+ *
+ *  qc_incarnation	token from generate_token(), set in qd_init()
+ *  qc_average		average write time, maintained by qd_update_wtime()
+ *  qc_last[]		ring of recent write times (HISTORY_LENGTH entries)
+ *  qc_fd		descriptor returned by qdisk_open()
+ *  qc_my_id		this node's ID (1-based)
+ *  qc_writes		number of write-time samples recorded so far
+ *  qc_interval		seconds slept between cycles
+ *  qc_tko		missed-update threshold used for eviction
+ *  qc_scoremin		minimum score required (<= 0: derived from max score)
+ *  qc_status		state this node currently reports
+ *  qc_master		node ID of the current master, 0 if none
+ *  qc_ch		CMAN handle
+ *  qc_device		device path; freed by qd_destroy()
+ *  qc_status_file	local status output file, "-" meaning stdout
+ *
+ * NOTE(review): uses struct timeval, but this header does not include
+ * <sys/time.h> itself -- includers must provide it.
+ */
+typedef struct {
+ uint64_t qc_incarnation;
+ struct timeval qc_average;
+ struct timeval qc_last[HISTORY_LENGTH];
+ int qc_fd;
+ int qc_my_id;
+ int qc_writes;
+ int qc_interval;
+ int qc_tko;
+ int qc_votes;
+ int qc_scoremin;
+ disk_node_state_t qc_disk_status;
+ disk_node_state_t qc_status;
+ int qc_master; /* Master?! */
+ int qc_unused;
+ cman_handle_t qc_ch;
+ char *qc_device;
+ char *qc_label;
+ char *qc_status_file;
+} qd_ctx;
+
+/*
+ * Our in-memory view of one node (array index = node ID - 1).
+ *
+ *  ni_incarnation	incarnation recorded when the node came online
+ *  ni_evil_incarnation	incarnation the node had when it was declared
+ *			dead; seeing it again marks the node "undead"
+ *  ni_last_seen	last ps_timestamp value observed for the node
+ *  ni_misses		consecutive cycles with an unchanged timestamp
+ *  ni_seen		cycles in which the timestamp changed
+ *  ni_msg		message most recently read from the node's block
+ *  ni_state		state we believe the node is in
+ *  ni_status		last status block read from disk (host byte order)
+ */
+typedef struct {
+ uint64_t ni_incarnation;
+ uint64_t ni_evil_incarnation;
+ time_t ni_last_seen;
+ int ni_misses;
+ int ni_seen;
+ disk_msg_t ni_msg;
+ disk_msg_t ni_last_msg;
+ disk_node_state_t ni_state;
+ status_block_t ni_status;
+} node_info_t;
+
+/* disk_util.c: write the status block of node 'nid'; msg, mask and
+   master may each be NULL.  Returns 0 on success, -1 on error. */
+int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
+ disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
+/* disk_util.c: read the status block of node 'nid' and print it */
+int qd_read_print_status(int fd, int nid);
+/* disk_util.c: zero a context and record the CMAN handle and our ID */
+int qd_init(qd_ctx *ctx, cman_handle_t ch, int me);
+/* disk_util.c: release the device name and close the descriptor */
+void qd_destroy(qd_ctx *ctx);
+
+/* proc.c */
+int find_partitions(const char *partfile, const char *label,
+ char *devname, size_t devlen, int print);
+int check_device(char *device, char *label, quorum_header_t *qh);
+
+
+#endif
/cvs/cluster/cluster/cman/qdisk/disk_util.c,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/disk_util.c
+++ - 2006-07-21 18:01:39.967981000 +0000
@@ -0,0 +1,293 @@
+/**
+ Copyright Red Hat, Inc. 2006
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+
+ Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+ @file Misc. Quorum daemon context utilities / high-level functions
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+
+
+/**
+  Store (end - start) in dest, normalizing so that tv_usec is never
+  negative.  dest may alias start or end: both inputs are fully read
+  before dest is written.
+ */
+static inline void
+_diff_tv(struct timeval *dest, struct timeval *start, struct timeval *end)
+{
+	time_t secs = end->tv_sec - start->tv_sec;
+	suseconds_t usecs = end->tv_usec - start->tv_usec;
+
+	/* Borrow one second when the microsecond part underflows */
+	if (usecs < 0) {
+		usecs += 1000000;
+		--secs;
+	}
+
+	dest->tv_sec = secs;
+	dest->tv_usec = usecs;
+}
+
+
+/**
+  Record a new write time and recompute the average write time.
+
+  Samples are kept in the ring ctx->qc_last[] (HISTORY_LENGTH entries).
+  The sample for write number N (0-based) goes into slot
+  N % HISTORY_LENGTH, so that after the counter is incremented the most
+  recent sample is at (qc_writes - 1) % HISTORY_LENGTH -- the slot
+  qd_write_status() reads -- and slots 0..min(qc_writes, HISTORY_LENGTH)-1
+  are exactly the populated ones that get averaged.
+
+  @param ctx		Quorum daemon context.
+  @param newtime	Duration of the most recent write.
+ */
+void
+qd_update_wtime(qd_ctx *ctx, struct timeval *newtime)
+{
+	int x;
+	int max = HISTORY_LENGTH;
+	uint64_t sum = 0;
+
+	/* Store the thing, then count it */
+	ctx->qc_last[ctx->qc_writes % HISTORY_LENGTH].tv_sec =
+		newtime->tv_sec;
+	ctx->qc_last[ctx->qc_writes % HISTORY_LENGTH].tv_usec =
+		newtime->tv_usec;
+	ctx->qc_writes++;
+
+	/* Until the ring has wrapped, only the first qc_writes slots hold
+	   real samples */
+	if (ctx->qc_writes < HISTORY_LENGTH)
+		max = ctx->qc_writes;
+
+	for (x = 0; x < max; x++) {
+		/* Widen before multiplying so a 32-bit time_t cannot
+		   overflow */
+		sum += ((uint64_t)ctx->qc_last[x].tv_sec * 1000000);
+		sum += ctx->qc_last[x].tv_usec;
+	}
+
+	sum /= max;
+
+	ctx->qc_average.tv_sec = (sum / 1000000);
+	ctx->qc_average.tv_usec = (sum % 1000000);
+}
+
+
+/**
+  Write a status block to disk, given state, nodeid, message, and the
+  membership mask.
+
+  The time taken by the disk write is measured and fed to
+  qd_update_wtime(); if the clock cannot be read, the current average is
+  recorded instead.
+
+  @param ctx		Quorum daemon context (must not be NULL).
+  @param nid		Node ID whose block is written, 1..MAX_NODES_DISK.
+  @param state		State to store in the block.
+  @param msg		Bid/vote message to store, or NULL for none.
+  @param mask		Membership mask to store, or NULL for all-zero.
+  @param master	Master's membership mask, or NULL for all-zero.
+  @return		0 on success; -1 on error (errno is EINVAL for bad
+			arguments).
+ */
+int
+qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
+		disk_msg_t *msg, memb_mask_t mask, memb_mask_t master)
+{
+	status_block_t ps;
+	struct timeval start, end;
+	int utime_ok = 1;
+
+	if (!ctx) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Only node IDs 1..MAX_NODES_DISK have a status block; anything
+	   else would be written outside the status-block area. */
+	if (nid <= 0 || nid > MAX_NODES_DISK) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	ps.ps_magic = STATE_MAGIC_NUMBER;
+	ps.ps_nodeid = nid;
+	ps.ps_updatenode = ctx->qc_my_id;
+	ps.pad0 = 0;
+	ps.ps_timestamp = (uint64_t)time(NULL);
+	ps.ps_state = (uint8_t)state;
+	ps.pad1[0] = 0;
+	ps.ps_flags = 0;
+	ps.ps_score = 0;
+	ps.ps_scoremax = 0;
+	ps.ps_ca_sec = ctx->qc_average.tv_sec;
+	ps.ps_ca_usec = ctx->qc_average.tv_usec;
+	ps.ps_incarnation = ctx->qc_incarnation;
+	if (mask) {
+		memcpy(ps.ps_mask, mask, sizeof(memb_mask_t));
+	} else {
+		memset(ps.ps_mask, 0, sizeof(memb_mask_t));
+	}
+	if (master) {
+		memcpy(ps.ps_master_mask, master, sizeof(memb_mask_t));
+	} else {
+		memset(ps.ps_master_mask, 0, sizeof(memb_mask_t));
+	}
+
+	/* Duration of the previous write, if there was one */
+	if (ctx->qc_writes) {
+		ps.ps_lc_sec =
+		   ctx->qc_last[(ctx->qc_writes - 1) % HISTORY_LENGTH].tv_sec;
+		ps.ps_lc_usec =
+		   ctx->qc_last[(ctx->qc_writes - 1) % HISTORY_LENGTH].tv_usec;
+	} else {
+		ps.ps_lc_sec = ps.ps_lc_usec = 0;
+	}
+
+	/* Argh! */
+	if (msg) {
+		ps.ps_msg = msg->m_msg;
+		ps.ps_seq = msg->m_seq;
+		ps.ps_arg = msg->m_arg;
+	} else {
+		ps.ps_msg = 0;
+		ps.ps_seq = 0;
+		ps.ps_arg = 0;
+	}
+
+	if (gettimeofday(&start, NULL) < 0)
+		utime_ok = 0;
+	swab_status_block_t(&ps);
+	if (qdisk_write(ctx->qc_fd, qdisk_nodeid_offset(nid), &ps,
+			sizeof(ps)) < 0) {
+		printf("Error writing node ID block %d\n", nid);
+		return -1;
+	}
+	if (utime_ok && (gettimeofday(&end, NULL) < 0))
+		utime_ok = 0;
+
+	if (utime_ok) {
+		/* start := end - start, i.e. the write duration */
+		_diff_tv(&start,&start,&end);
+	} else {
+		/* Use heuristic */
+		start.tv_sec = ctx->qc_average.tv_sec;
+		start.tv_usec = ctx->qc_average.tv_usec;
+	}
+	qd_update_wtime(ctx, &start);
+
+	return 0;
+}
+
+
+/**
+  Print the fields of a status block (already in host byte order) to
+  stdout in a struct-initializer-like layout.  The two masks are printed
+  as hex, highest byte first.
+
+  @param ps	Status block to print.
+  @return	Always 0.
+ */
+int
+qd_print_status(status_block_t *ps)
+{
+	int x;
+
+	printf("Data @ offset %d:\n",
+	       (int)qdisk_nodeid_offset(ps->ps_nodeid));
+	printf("status_block_t {\n");
+	printf("\t.ps_magic = %08x;\n", (int)ps->ps_magic);
+	printf("\t.ps_nodeid = %d;\n", (int)ps->ps_nodeid);
+	printf("\t.ps_updatenode = %d;\n", (int)ps->ps_updatenode);
+	printf("\t.pad0 = %d;\n", (int)ps->pad0);
+	printf("\t.ps_timestamp = %llu;\n", (long long unsigned)
+		ps->ps_timestamp);
+	printf("\t.ps_state = %d;\n", ps->ps_state);
+	printf("\t.pad1[0] = %d;\n", ps->pad1[0]);
+	printf("\t.ps_flags = %d;\n", ps->ps_flags);
+	printf("\t.ps_score = %d;\n", ps->ps_score);
+	printf("\t.ps_scoremax = %d;\n", ps->ps_scoremax);
+	printf("\t.ps_ca_sec = %d;\n", ps->ps_ca_sec);
+	printf("\t.ps_ca_usec = %d;\n", ps->ps_ca_usec);
+	printf("\t.ps_lc_sec = %d;\n", ps->ps_lc_sec);
+	printf("\t.ps_lc_usec = %d;\n", ps->ps_lc_usec);
+	printf("\t.ps_mask = 0x");
+	for (x = (sizeof(memb_mask_t)-1); x >= 0; x--)
+		printf("%02x", ps->ps_mask[x]);
+	printf("\n");
+	printf("\t.ps_master_mask = 0x");
+	/* Print the master mask itself, not ps_mask a second time */
+	for (x = (sizeof(memb_mask_t)-1); x >= 0; x--)
+		printf("%02x", ps->ps_master_mask[x]);
+	printf("\n");
+
+	printf("}\n");
+
+	return 0;
+}
+
+
+/**
+  Read the status block of node 'nid' from the open quorum disk 'fd',
+  convert it to host byte order and print it with qd_print_status().
+
+  @return	0 on success; -1 with errno = EINVAL for a negative fd or
+		a non-positive nid; -1 if the block cannot be read.
+ */
+int
+qd_read_print_status(int fd, int nid)
+{
+	status_block_t block;
+
+	/* Reject unusable arguments up front */
+	if (fd < 0 || nid <= 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (qdisk_read(fd, qdisk_nodeid_offset(nid), &block,
+		       sizeof(block)) < 0) {
+		printf("Error reading node ID block %d\n", nid);
+		return -1;
+	}
+
+	swab_status_block_t(&block);
+	qd_print_status(&block);
+
+	return 0;
+}
+
+
+/**
+  Generate a non-zero token based on the current system time.
+
+  The high 32 bits are the seconds and the low 32 bits the microseconds
+  of the current time, so two tokens generated within the same second
+  still differ.
+
+  @return	The token; never 0.
+ */
+uint64_t
+generate_token(void)
+{
+	uint64_t my_token = 0;
+	struct timeval tv;
+
+	while(my_token == 0) {
+		gettimeofday(&tv, NULL);
+
+		my_token = ((uint64_t) (tv.tv_sec) << 32) |
+			(uint64_t) (tv.tv_usec & 0x00000000ffffffff);
+	}
+
+	return my_token;
+}
+
+
+/**
+  Initialize a quorum disk context, given a CMAN handle and a nodeid.
+
+  The context is zeroed, given a fresh incarnation token, and its disk
+  descriptor is marked as not open (-1) so that qd_destroy() on a context
+  whose disk was never opened cannot close descriptor 0.
+
+  @param ctx	Context to initialize.
+  @param ch	CMAN handle (must be non-NULL).
+  @param me	This node's ID (must be non-zero).
+  @return	0 on success, -1 with errno = EINVAL on a bad argument.
+ */
+int
+qd_init(qd_ctx *ctx, cman_handle_t ch, int me)
+{
+	if (!ctx || !ch || !me) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->qc_fd = -1;
+	ctx->qc_incarnation = generate_token();
+	ctx->qc_ch = ch;
+	ctx->qc_my_id = me;
+
+	return 0;
+}
+
+
+/**
+  Destroy a quorum disk context.
+
+  Does nothing for a NULL context or one whose node ID is 0 (i.e. one
+  that qd_init() never set up).  Otherwise frees the device name and
+  closes the disk descriptor if it is valid.
+ */
+void
+qd_destroy(qd_ctx *ctx)
+{
+	if (!ctx || ctx->qc_my_id == 0)
+		return;
+	if (ctx->qc_device) {
+		free(ctx->qc_device);
+		ctx->qc_device = NULL;
+	}
+	/* Only close descriptors that can actually be open */
+	if (ctx->qc_fd >= 0)
+		close(ctx->qc_fd);
+	ctx->qc_fd = -1;
+}
/cvs/cluster/cluster/cman/qdisk/gettid.c,v --> standard output
revision 1.4.2.1
--- cluster/cman/qdisk/gettid.c
+++ - 2006-07-21 18:01:40.062745000 +0000
@@ -0,0 +1,24 @@
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/unistd.h>
+#include <gettid.h>
+#include <errno.h>
+#include <unistd.h>
+
+/* Patch from Adam Conrad / Ubuntu: Don't use _syscall macro */
+
+#ifdef __NR_gettid
+/* Return the caller's thread ID using the raw gettid system call */
+pid_t gettid (void)
+{
+	return syscall(__NR_gettid);
+}
+#else
+
+/* '#warning' is the directive GCC understands; '#warn' is rejected as an
+   invalid preprocessing directive. */
+#warning "gettid not available -- substituting with pthread_self()"
+
+#include <pthread.h>
+/* Fallback: derive an ID from pthread_self() */
+pid_t gettid (void)
+{
+	return (pid_t)pthread_self();
+}
+#endif
/cvs/cluster/cluster/cman/qdisk/gettid.h,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/gettid.h
+++ - 2006-07-21 18:01:40.159682000 +0000
@@ -0,0 +1,7 @@
+#ifndef __GETTID_H
+#define __GETTID_H
+
+/* pid_t comes from <sys/types.h>; include it here so this header can be
+   included on its own. */
+#include <sys/types.h>
+
+/* Return an identifier for the calling thread (see gettid.c) */
+pid_t gettid(void);
+
+#endif
+
/cvs/cluster/cluster/cman/qdisk/main.c,v --> standard output
revision 1.3.2.1
--- cluster/cman/qdisk/main.c
+++ - 2006-07-21 18:01:40.252627000 +0000
@@ -0,0 +1,1026 @@
+/**
+ Copyright Red Hat, Inc. 2006
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+
+ Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+ @file Main loop / functions for disk-based quorum daemon.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <errno.h>
+#include <disk.h>
+#include <platform.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/reboot.h>
+#include <linux/reboot.h>
+#include <signal.h>
+#include <ccs.h>
+#include "score.h"
+#include "clulog.h"
+/*
+ TODO:
+ 1) Take into account timings to gracefully extend node timeouts during
+ node spikes (that's why they are there!).
+ 2) Poll ccsd for configuration changes.
+ 3) Logging.
+ */
+
+/* From bitmap.c */
+/* Bit helpers for membership masks.  In this file they are called with a
+   0-based bit index (node ID - 1) and the mask length in bytes. */
+int clear_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+int set_bit(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+int is_bit_set(uint8_t *mask, uint32_t bitidx, uint32_t masklen);
+/* Main-loop run flag: set to 1 by quorum_loop(), cleared by int_handler().
+   volatile sig_atomic_t because it is written from a signal handler and
+   polled by the loop. */
+static volatile sig_atomic_t _running = 0;
+
+
+/**
+  Ask the main loop to stop by clearing the run flag.
+  NOTE(review): presumably installed as a signal handler elsewhere in
+  this file -- confirm which signals.
+ */
+static void
+int_handler(int sig)
+{
+	(void)sig;	/* unused */
+	_running = 0;
+}
+
+
+/**
+  Tell whether a state counts as "running" (S_INIT or above).
+
+  @return	The state value itself (non-zero) when it is at least
+		S_INIT, otherwise 0.
+ */
+inline int
+state_run(disk_node_state_t state)
+{
+	if (state < S_INIT)
+		return 0;
+
+	return state;
+}
+
+
+/**
+  Reset an array of 'max' node info entries: everything is zeroed, then
+  each entry gets its 1-based node ID and the current time as both its
+  status timestamp and its last-seen time.
+ */
+void
+node_info_init(node_info_t *ni, int max)
+{
+	const time_t now = time(NULL);
+	node_info_t *entry;
+	int idx;
+
+	/* Zeroing also clears the miss counters */
+	memset(ni, 0, sizeof(*ni) * max);
+
+	for (idx = 0, entry = ni; idx < max; idx++, entry++) {
+		/* node ids are 1-based */
+		entry->ni_status.ps_nodeid = (idx + 1);
+		entry->ni_status.ps_timestamp = now;
+		entry->ni_last_seen = now;
+	}
+}
+
+
+/**
+  Inspect our own status block for changes made by another node.
+
+  Nothing happens when the block has no updater recorded or was last
+  written by us.  If another node wrote S_EVICT into it we reboot.  For
+  any other state -- or if reboot() returns -- an emergency message is
+  logged and the process stops itself with SIGSTOP.
+ */
+void
+check_self(qd_ctx *ctx, status_block_t *sb)
+{
+	/* Untouched, or our own write: nothing to do */
+	if (sb->ps_updatenode == 0 ||
+	    sb->ps_updatenode == ctx->qc_my_id)
+		return;
+
+	/* I did not update this??! */
+	if (sb->ps_state == S_EVICT) {
+		/* Someone told us to die. */
+		reboot(RB_AUTOBOOT);
+		/* Only reached if reboot() returned */
+	}
+
+	clulog(LOG_EMERG, "Unhandled state: %d\n", sb->ps_state);
+	raise(SIGSTOP);
+}
+
+
+/**
+  Read in the node blocks off of the quorum disk and see if anyone has
+  or has not updated their timestamp recently.  See check_transitions as
+  well.
+
+  For every node other than ourselves: the node's message is copied into
+  ni_msg, and if the node reports a running state, an unchanged timestamp
+  counts as a miss while a changed one resets the miss counter and counts
+  as "seen".  Our own block is handed to check_self() instead.
+
+  @param ctx	Quorum daemon context (provides the disk descriptor).
+  @param ni	Node info array, index = node ID - 1.
+  @param max	Number of entries in ni.
+ */
+void
+read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
+{
+	int x;
+	status_block_t *sb;
+
+	for (x = 0; x < max; x++) {
+
+		sb = &ni[x].ni_status;
+
+		if (qdisk_read(ctx->qc_fd, qdisk_nodeid_offset(x+1),
+			       sb, sizeof(*sb)) < 0) {
+			clulog(LOG_WARNING,"Error reading node ID block %d\n",
+			       x+1);
+			/* Keep working with the previous copy, which is
+			   already in host byte order; swapping it again
+			   would corrupt it.  (Assumes a failed read leaves
+			   the buffer untouched -- TODO confirm.) */
+		} else {
+			swab_status_block_t(sb);
+		}
+
+		if (sb->ps_nodeid == ctx->qc_my_id) {
+			check_self(ctx, sb);
+			continue;
+		}
+		/* message. */
+		ni[x].ni_msg.m_arg = sb->ps_arg;
+		ni[x].ni_msg.m_msg = sb->ps_msg;
+		ni[x].ni_msg.m_seq = sb->ps_seq;
+
+		if (!state_run(sb->ps_state))
+			continue;
+
+		/* Unchanged timestamp: miss */
+		if (sb->ps_timestamp == ni[x].ni_last_seen) {
+			/* XXX check for average + allow grace */
+			ni[x].ni_misses++;
+			continue;
+		}
+
+		/* Got through?  The node is good. */
+		ni[x].ni_misses = 0;
+		ni[x].ni_seen++;
+		ni[x].ni_last_seen = sb->ps_timestamp;
+	}
+}
+
+
+/**
+ Check for node transitions.
+
+ Walks all node info entries and, based on the status block last read
+ from disk and our own bookkeeping, moves each node between the
+ offline / evicted / online / master views.  The cases are tested in
+ order and the first one that matches ends processing for that node.
+
+ @param ctx	Quorum daemon context.
+ @param ni	Node info array, index = node ID - 1.
+ @param max	Number of entries in ni.
+ @param mask	If non-NULL, cleared on entry and then filled with one
+		bit (node ID - 1) per node considered online.
+ */
+void
+check_transitions(qd_ctx *ctx, node_info_t *ni, int max, memb_mask_t mask)
+{
+ int x;
+
+ if (mask)
+ memset(mask, 0, sizeof(memb_mask_t));
+
+ for (x = 0; x < max; x++) {
+
+ /*
+ Case 1: check to see if the node is still up
+ according to our internal state, but has been
+ evicted by the master or cleanly shut down
+ (or restarted).
+
+ Transition from Evicted/Shutdown -> Offline
+ */
+ if ((ni[x].ni_state >= S_EVICT &&
+ ni[x].ni_status.ps_state <= S_EVICT) ||
+ (ni[x].ni_incarnation &&
+ (ni[x].ni_incarnation !=
+ ni[x].ni_status.ps_incarnation))) {
+
+ if (ni[x].ni_status.ps_state == S_EVICT) {
+ clulog(LOG_NOTICE, "Node %d evicted\n",
+ ni[x].ni_status.ps_nodeid);
+ } else {
+ /* State == S_NONE or incarnation change */
+ clulog(LOG_INFO, "Node %d shutdown\n",
+ ni[x].ni_status.ps_nodeid);
+ ni[x].ni_evil_incarnation = 0;
+ }
+
+ /* Forget everything we knew about the node's last run */
+ ni[x].ni_incarnation = 0;
+ ni[x].ni_seen = 0;
+ ni[x].ni_misses = 0;
+ ni[x].ni_state = S_NONE;
+
+ continue;
+ }
+
+ /*
+ Case 2: Check for a heartbeat timeout. Write an eviction
+ notice if we're the master. If this is our first notice
+ of the heartbeat timeout, update our internal state
+ accordingly. When the master evicts this node, we will
+ hit case 1 above.
+
+ Transition from Online -> Evicted
+ */
+ if (ni[x].ni_misses > ctx->qc_tko &&
+ state_run(ni[x].ni_status.ps_state)) {
+
+ /*
+ Write eviction notice if we're the master.
+ */
+ if (ctx->qc_status == S_MASTER) {
+ clulog(LOG_DEBUG,
+ "Writing eviction notice for node %d\n",
+ ni[x].ni_status.ps_nodeid);
+ qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+ S_EVICT, NULL, NULL, NULL);
+ clulog(LOG_DEBUG,
+ "Telling CMAN to kill the node\n");
+ cman_kill_node(ctx->qc_ch,
+ ni[x].ni_status.ps_nodeid);
+ }
+
+ /*
+ Mark our internal views as dead if nodes miss too
+ many heartbeats... This will cause a master
+ transition if no live master exists.
+ */
+ if (ni[x].ni_status.ps_state >= S_RUN &&
+ ni[x].ni_seen) {
+ clulog(LOG_DEBUG, "Node %d DOWN\n",
+ ni[x].ni_status.ps_nodeid);
+ ni[x].ni_seen = 0;
+ }
+
+ /* Remember the incarnation that was declared dead so that
+ Case 3 can recognise it if it writes again */
+ ni[x].ni_state = S_EVICT;
+ ni[x].ni_status.ps_state = S_EVICT;
+ ni[x].ni_evil_incarnation =
+ ni[x].ni_status.ps_incarnation;
+
+ continue;
+ }
+
+ /*
+ Case 3: Check for node who is supposed to be dead, but
+ has started writing to the disk again with the same
+ incarnation.
+
+ Transition from Offline -> Undead (BAD!!!)
+ */
+ if (ni[x].ni_evil_incarnation &&
+ (ni[x].ni_evil_incarnation ==
+ ni[x].ni_status.ps_incarnation)) {
+ clulog(LOG_CRIT, "Node %d is undead.\n",
+ ni[x].ni_status.ps_nodeid);
+
+ clulog(LOG_ALERT,
+ "Writing eviction notice for node %d\n",
+ ni[x].ni_status.ps_nodeid);
+ qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+ S_EVICT, NULL, NULL, NULL);
+ ni[x].ni_status.ps_state = S_EVICT;
+
+ /* XXX Need to fence it again */
+ clulog(LOG_DEBUG, "Telling CMAN to kill the node\n");
+ cman_kill_node(ctx->qc_ch,
+ ni[x].ni_status.ps_nodeid);
+ continue;
+ }
+
+
+ /*
+ Case 4: Check for a node who has met our minimum # of
+ 'seen' requests.
+
+ Transition from Offline -> Online
+ */
+ if (ni[x].ni_seen > (ctx->qc_tko / 2) &&
+ !state_run(ni[x].ni_state)) {
+ /*
+ Node-join - everyone just kind of "agrees"
+ there's no consensus to just have a node join
+ right now.
+ */
+ ni[x].ni_state = S_RUN;
+ clulog(LOG_DEBUG, "Node %d is UP\n",
+ ni[x].ni_status.ps_nodeid);
+ ni[x].ni_incarnation =
+ ni[x].ni_status.ps_incarnation;
+ if (mask)
+ set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+ sizeof(memb_mask_t));
+
+ continue;
+ }
+
+ /*
+ Case 5: Check for a node becoming master. Not really a
+ transition.
+ */
+ if (ni[x].ni_state == S_RUN &&
+ ni[x].ni_status.ps_state == S_MASTER) {
+ clulog(LOG_INFO, "Node %d is the master\n",
+ ni[x].ni_status.ps_nodeid);
+ ni[x].ni_state = S_MASTER;
+ if (mask)
+ set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+ sizeof(memb_mask_t));
+ continue;
+ }
+
+ /*
+ All other cases: Believe the node's reported state ;)
+ */
+ if (state_run(ni[x].ni_state)) {
+ ni[x].ni_state = ni[x].ni_status.ps_state;
+ if (mask)
+ set_bit(mask, (ni[x].ni_status.ps_nodeid-1),
+ sizeof(memb_mask_t));
+ }
+ }
+}
+
+
+/**
+  Look for an online master and for the lowest online node ID.
+
+  @param ctx	Quorum daemon context.
+  @param ni	Node info array, index = node ID - 1.
+  @param max	Number of entries in ni.
+  @param low_id	Out: starts as our own node ID and is lowered to the
+		smallest ID among the other nodes we consider running
+		(masters that are ourselves are skipped).
+  @return	Node ID of the first master found, or 0 if there is none.
+		A critical message is logged when more than one master
+		is counted.
+ */
+int
+master_exists(qd_ctx *ctx, node_info_t *ni, int max, int *low_id)
+{
+	int idx;
+	int master_count = 0;
+	int master_id = 0;
+
+	*low_id = ctx->qc_my_id;
+
+	for (idx = 0; idx < max; idx++) {
+		node_info_t *node = &ni[idx];
+		int claims_master = (node->ni_status.ps_state == S_MASTER);
+		int alive = (node->ni_state >= S_RUN);
+
+		/* See if this one's a master */
+		if (alive && claims_master) {
+			if (!master_id)
+				master_id = node->ni_status.ps_nodeid;
+			++master_count;
+		}
+
+		/* See if it's us... */
+		if (claims_master &&
+		    node->ni_status.ps_nodeid == ctx->qc_my_id) {
+			if (!master_id)
+				master_id = ctx->qc_my_id;
+			++master_count;
+			continue;
+		}
+
+		/* Nodes that are not running cannot lower low_id */
+		if (!alive) {
+			/* Look for dead master */
+			if (claims_master)
+				clulog(LOG_DEBUG,
+				       "Node %d is marked master, but is dead.\n",
+				       node->ni_status.ps_nodeid);
+			continue;
+		}
+
+		if (node->ni_status.ps_nodeid < *low_id)
+			*low_id = node->ni_status.ps_nodeid;
+	}
+
+	if (master_count > 1) {
+		clulog(LOG_CRIT,
+		       "Critical Error: More than one master found!\n");
+		/* XXX Handle this how? */
+	}
+
+	return master_id;
+}
+
+
+/**
+  Bring the quorum daemon up on its disk.
+
+  Validates and opens the device, starts the scoring thread, resets the
+  node info array and writes our block as S_INIT.  It then runs qc_tko
+  observation rounds (read blocks, check transitions, rewrite our S_INIT
+  block, sleep qc_interval) to find out whether a cluster is already
+  using this disk.  Note that this delays master election when several
+  nodes start within a second or two of each other.
+
+  @return	0 on success, -1 on failure.
+ */
+int
+quorum_init(qd_ctx *ctx, node_info_t *ni, int max, struct h_data *h, int maxh)
+{
+	int round, score, maxscore;
+
+	clulog(LOG_INFO, "Quorum Daemon Initializing\n");
+
+	if (qdisk_validate(ctx->qc_device) < 0)
+		return -1;
+
+	ctx->qc_fd = qdisk_open(ctx->qc_device);
+	if (ctx->qc_fd < 0) {
+		clulog(LOG_CRIT, "Failed to open %s: %s\n", ctx->qc_device,
+		       strerror(errno));
+		return -1;
+	}
+
+	start_score_thread(h, maxh);
+
+	node_info_init(ni, max);
+	if (qd_write_status(ctx, ctx->qc_my_id,
+			    S_INIT, NULL, NULL, NULL) != 0) {
+		clulog(LOG_CRIT, "Could not initialize status block!\n");
+		return -1;
+	}
+
+	/* Watch the disk for qc_tko rounds before joining in */
+	for (round = 1; round <= ctx->qc_tko; round++) {
+		read_node_blocks(ctx, ni, max);
+		check_transitions(ctx, ni, max, NULL);
+
+		if (qd_write_status(ctx, ctx->qc_my_id,
+				    S_INIT, NULL, NULL, NULL) != 0) {
+			clulog(LOG_CRIT, "Initialization failed\n");
+			return -1;
+		}
+
+		sleep(ctx->qc_interval);
+	}
+
+	get_my_score(&score,&maxscore);
+	clulog(LOG_INFO, "Initial score %d/%d\n", score, maxscore);
+	clulog(LOG_INFO, "Initialization complete\n");
+
+	return 0;
+}
+
+
+/**
+  Vote for a master if it puts a bid in.
+
+  Scans the nodes in index order and, for the first one that is in
+  S_RUN, has a bid (M_BID) posted and has a node ID lower than ours,
+  fills *msg with an M_ACK addressed to it (same sequence number as the
+  bid).  *msg is left untouched when no such node exists.
+ */
+void
+do_vote(qd_ctx *ctx, node_info_t *ni, int max, disk_msg_t *msg)
+{
+	int idx;
+
+	for (idx = 0; idx < max; idx++) {
+		status_block_t *sb = &ni[idx].ni_status;
+
+		if (ni[idx].ni_state == S_RUN &&
+		    sb->ps_msg == M_BID &&
+		    sb->ps_nodeid < ctx->qc_my_id) {
+			/* Vote for lowest bidding ID that is lower
+			   than us */
+			msg->m_msg = M_ACK;
+			msg->m_arg = sb->ps_nodeid;
+			msg->m_seq = sb->ps_seq;
+			break;
+		}
+	}
+}
+
+
+/*
+  Check to match nodes in mask with nodes online according to CMAN.
+  Only the master needs to do this.
+
+  master_mask is cleared and then gets a bit for every node that is both
+  set in 'mask' and reported as a member by CMAN.  If the node list
+  cannot be obtained, master_mask is left unchanged.
+
+  Both masks are array parameters and therefore plain pointers here, so
+  their size must be taken from the memb_mask_t type, not from the
+  parameter (which would yield the pointer size).
+ */
+void
+check_cman(qd_ctx *ctx, memb_mask_t mask, memb_mask_t master_mask)
+{
+	cman_node_t nodes[MAX_NODES_DISK];
+	int retnodes, x;
+
+	if (cman_get_nodes(ctx->qc_ch, MAX_NODES_DISK,
+			   &retnodes, nodes) <0 )
+		return;
+
+	memset(master_mask, 0, sizeof(memb_mask_t));
+
+	for (x = 0; x < retnodes; x++) {
+		if (is_bit_set(mask, nodes[x].cn_nodeid-1,
+			       sizeof(memb_mask_t)) &&
+		    nodes[x].cn_member)
+			set_bit(master_mask, nodes[x].cn_nodeid-1,
+				sizeof(memb_mask_t));
+	}
+}
+
+
+/*
+  Tally the responses to our pending master bid.
+
+  Only nodes whose state we consider running are looked at.
+
+  returns:
+	3: all acks received - you are the master.
+	2: nacked (not highest score?) might not happen
+	1: other node with lower ID is bidding and we should rescind our
+	   bid.
+	0: still waiting; don't clear bid; just wait another round.
+  Modifies:
+	*msg - it will store the vote for the lowest bid if we should
+	       clear our bid.
+ */
+int
+check_votes(qd_ctx *ctx, node_info_t *ni, int max, disk_msg_t *msg)
+{
+	int idx;
+	int running = 0, acks = 0, nacks = 0;
+	int low_id = ctx->qc_my_id;
+
+	for (idx = 0; idx < max; idx++) {
+		status_block_t *sb = &ni[idx].ni_status;
+
+		if (!state_run(ni[idx].ni_state))
+			continue;
+		++running;
+
+		/* Responses addressed to us */
+		if (sb->ps_arg == ctx->qc_my_id) {
+			if (sb->ps_msg == M_ACK)
+				++acks;
+			else if (sb->ps_msg == M_NACK)
+				++nacks;
+		}
+
+		/* If there's someone with a lower ID who is also
+		   bidding for master, change our message to vote
+		   for the lowest bidding node ID */
+		if (sb->ps_msg == M_BID && sb->ps_nodeid < low_id) {
+			low_id = sb->ps_nodeid;
+			msg->m_msg = M_ACK;
+			msg->m_arg = sb->ps_nodeid;
+			msg->m_seq = sb->ps_seq;
+		}
+	}
+
+	if (acks == running)
+		return 3;
+	if (nacks)
+		return 2;
+	if (low_id != ctx->qc_my_id)
+		return 1;
+	return 0;
+}
+
+
+/**
+  Map a node state to a human-readable name.
+
+  @return	A constant string; "ILLEGAL" for values that are not one
+		of the defined states.
+ */
+char *
+state_str(disk_node_state_t s)
+{
+	if (s == S_NONE)
+		return "None";
+	if (s == S_EVICT)
+		return "Evicted";
+	if (s == S_INIT)
+		return "Initializing";
+	if (s == S_RUN)
+		return "Running";
+	if (s == S_MASTER)
+		return "Master";
+
+	return "ILLEGAL";
+}
+
+
+/**
+  Write a human-readable summary of our view to the local status file.
+
+  Does nothing when no status file is configured.  A file name of "-"
+  means stdout; any other name is (re)created and closed again here.  The
+  summary lists our node ID, score figures, states, the set of visible
+  nodes and, if a master is known, the master's ID and the quorate set
+  taken from the master's ps_master_mask.
+ */
+void
+update_local_status(qd_ctx *ctx, node_info_t *ni, int max, int score,
+		    int score_req, int score_max)
+{
+	FILE *out;
+	int idx;
+	int opened_here;
+
+	if (!ctx->qc_status_file)
+		return;
+
+	opened_here = (strcmp(ctx->qc_status_file, "-") != 0);
+	if (opened_here) {
+		out = fopen(ctx->qc_status_file, "w+");
+		if (out == NULL)
+			return;
+	} else {
+		out = stdout;
+	}
+
+	fprintf(out, "Node ID: %d\n", ctx->qc_my_id);
+	fprintf(out, "Score (current / min req. / max allowed): %d / %d / %d\n",
+		score, score_req, score_max);
+	fprintf(out, "Current state: %s\n", state_str(ctx->qc_status));
+	fprintf(out, "Current disk state: %s\n",
+		state_str(ctx->qc_disk_status));
+
+	/* Every running node, plus ourselves */
+	fprintf(out, "Visible Set: {");
+	for (idx = 0; idx < max; idx++) {
+		if (ni[idx].ni_state >= S_RUN ||
+		    ni[idx].ni_status.ps_nodeid == ctx->qc_my_id)
+			fprintf(out," %d", ni[idx].ni_status.ps_nodeid);
+	}
+	fprintf(out, " }\n");
+
+	if (ctx->qc_master) {
+		fprintf(out, "Master Node ID: %d\n", ctx->qc_master);
+		fprintf(out, "Quorate Set: {");
+		for (idx = 0; idx < max; idx++) {
+			if (is_bit_set(
+				ni[ctx->qc_master-1].ni_status.ps_master_mask,
+				ni[idx].ni_status.ps_nodeid-1,
+				sizeof(memb_mask_t))) {
+				fprintf(out," %d",
+					ni[idx].ni_status.ps_nodeid);
+			}
+		}
+		fprintf(out, " }\n");
+	} else {
+		fprintf(out, "No master node\n");
+	}
+
+	fprintf(out, "\n");
+	if (opened_here)
+		fclose(out);
+}
+
+
+
+/**
+  Main loop of the quorum daemon.
+
+  Each cycle: read all node blocks, evaluate node transitions, compare
+  our heuristic score with the required minimum, determine the master,
+  take part in the bid/vote protocol or act as master/member, write our
+  own status block and the local status file, then sleep qc_interval
+  seconds.  The loop ends when the run flag is cleared.
+
+  @param ctx	Quorum daemon context.
+  @param ni	Node info array, index = node ID - 1.
+  @param max	Number of entries in ni.
+  @return	Always 0.
+ */
+int
+quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
+{
+	disk_msg_t msg = {0, 0, 0};
+	int low_id, bid_pending = 0, score, score_max, score_req;
+	memb_mask_t mask, master_mask;
+
+	/* master_mask is only filled in by check_cman() once we are the
+	   master, but it is written to disk every cycle: start from all
+	   zeroes instead of whatever happens to be on the stack. */
+	memset(mask, 0, sizeof(mask));
+	memset(master_mask, 0, sizeof(master_mask));
+
+	ctx->qc_status = S_RUN;
+
+	_running = 1;
+	while (_running) {
+		/* Read everyone else's status */
+		read_node_blocks(ctx, ni, max);
+
+		/* Check for node transitions */
+		check_transitions(ctx, ni, max, mask);
+
+		/* Check heuristics and remove ourself if necessary */
+		get_my_score(&score, &score_max);
+
+		/* No configured minimum: require a simple majority of the
+		   maximum score */
+		score_req = ctx->qc_scoremin;
+		if (score_req <= 0)
+			score_req = ((score_max + 1) / 2);
+
+		if (score < score_req) {
+			clear_bit(mask, (ctx->qc_my_id-1), sizeof(mask));
+			if (ctx->qc_status > S_NONE) {
+				clulog(LOG_NOTICE,
+				       "Score insufficient for master "
+				       "operation (%d/%d; max=%d); "
+				       "downgrading\n",
+				       score, score_req, score_max);
+				ctx->qc_status = S_NONE;
+				msg.m_msg = M_NONE;
+				++msg.m_seq;
+				bid_pending = 0;
+				cman_poll_quorum_device(ctx->qc_ch, 0);
+				/* reboot??? */
+			}
+		} else {
+			set_bit(mask, (ctx->qc_my_id-1), sizeof(mask));
+			if (ctx->qc_status == S_NONE) {
+				clulog(LOG_NOTICE,
+				       "Score sufficient for master "
+				       "operation (%d/%d; max=%d); "
+				       "upgrading\n",
+				       score, score_req, score_max);
+				ctx->qc_status = S_RUN;
+			}
+		}
+
+		/* Find master */
+		ctx->qc_master = master_exists(ctx, ni, max, &low_id);
+
+		/* Figure out what to do based on what we know */
+		if (!ctx->qc_master &&
+		    low_id == ctx->qc_my_id &&
+		    ctx->qc_status == S_RUN &&
+		    !bid_pending ) {
+			/*
+			   If there's no master, and we are the lowest node
+			   ID, make a bid to become master if we're not
+			   already bidding.
+			 */
+
+			clulog(LOG_DEBUG,"Making bid for master\n");
+			msg.m_msg = M_BID;
+			++msg.m_seq;
+			bid_pending = 1;
+
+		} else if (!ctx->qc_master && !bid_pending) {
+
+			/* We're not the master, and we do not have a bid
+			   pending.  Check for voting on other nodes. */
+			do_vote(ctx, ni, max, &msg);
+		} else if (!ctx->qc_master && bid_pending) {
+
+			/* We're currently bidding for master.
+			   See if anyone's voted, or if we should
+			   rescind our bid */
+
+			/* Yes, those are all deliberate fallthroughs */
+			switch (check_votes(ctx, ni, max, &msg)) {
+			case 3:
+				clulog(LOG_INFO,
+				       "Assuming master role\n");
+				ctx->qc_status = S_MASTER;
+			case 2:
+				msg.m_msg = M_NONE;
+			case 1:
+				bid_pending = 0;
+			default:
+				break;
+			}
+		} else if (ctx->qc_status == S_MASTER &&
+			   ctx->qc_master != ctx->qc_my_id) {
+
+			/* We think we're master, but someone else claims
+			   that they are master. */
+
+			clulog(LOG_CRIT,
+			       "A master exists, but it's not me?!\n");
+			/* XXX Handle this how? Should not happen*/
+			/* reboot(RB_AUTOBOOT); */
+
+		} else if (ctx->qc_status == S_MASTER &&
+			   ctx->qc_master == ctx->qc_my_id) {
+
+			/* We are the master.  Poll the quorum device.
+			   We can't be the master unless we score high
+			   enough on our heuristics. */
+			check_cman(ctx, mask, master_mask);
+			cman_poll_quorum_device(ctx->qc_ch, 1);
+
+		} else if (ctx->qc_status == S_RUN && ctx->qc_master &&
+			   ctx->qc_master != ctx->qc_my_id) {
+
+			/* We're not the master, but a master exists
+			   Check to see if the master thinks we are
+			   online.  If we are, tell CMAN so. */
+			if (is_bit_set(
+			      ni[ctx->qc_master-1].ni_status.ps_master_mask,
+			      ctx->qc_my_id-1,
+			      sizeof(memb_mask_t))) {
+				cman_poll_quorum_device(ctx->qc_ch, 1);
+			}
+		}
+
+		/* Write out our status */
+		if (qd_write_status(ctx, ctx->qc_my_id, ctx->qc_status,
+				    &msg, mask, master_mask) != 0) {
+			clulog(LOG_ERR, "Error writing to quorum disk\n");
+		}
+
+		/* write out our local status */
+		update_local_status(ctx, ni, max, score, score_req, score_max);
+
+		/* Cycle.  We could time the loop and sleep
+		   usleep(interval-looptime), but this is fine for now.*/
+		if (_running)
+			sleep(ctx->qc_interval);
+	}
+
+	return 0;
+}
+
+
+/**
+  Announce on the quorum disk that this node is leaving.
+
+  Writes our status block with state S_NONE and no message or masks.
+  A failed write is only logged.
+
+  @param ctx		Quorum daemon context; qc_my_id selects our block.
+  @return		Always 0.
+ */
+int
+quorum_logout(qd_ctx *ctx)
+{
+	int write_failed;
+
+	/* Publish S_NONE for our own node ID */
+	write_failed = (qd_write_status(ctx, ctx->qc_my_id, S_NONE,
+					NULL, NULL, NULL) != 0);
+	if (write_failed)
+		clulog(LOG_WARNING,
+		       "Error writing to quorum disk during logout\n");
+
+	return 0;
+}
+
+
+/**
+  Look up one CCS path and hand back the string CCS returned for it.
+
+  @param ccsfd		Open CCS descriptor.
+  @param path		Query path.
+  @param val		Receives the string on success; the caller owns
+			it and releases it with free().
+  @return		Whatever ccs_get() returned (0 means found).
+ */
+static int
+qd_cfg_str(int ccsfd, const char *path, char **val)
+{
+	char query[256];
+
+	snprintf(query, sizeof(query), "%s", path);
+	return ccs_get(ccsfd, query, val);
+}
+
+
+/**
+  Look up one CCS path and convert the result with atoi().
+
+  @param ccsfd		Open CCS descriptor.
+  @param path		Query path.
+  @param out		Receives the converted value; untouched if the
+			path is not present.
+  @return		0 if the path was found, -1 otherwise.
+ */
+static int
+qd_cfg_int(int ccsfd, const char *path, int *out)
+{
+	char *val;
+
+	if (qd_cfg_str(ccsfd, path, &val) != 0)
+		return -1;
+
+	*out = atoi(val);
+	free(val);
+	return 0;
+}
+
+
+/**
+  Load the quorum daemon configuration from CCSD.
+
+  Connects to CCSD, seeds ctx with defaults (interval 1, tko 10,
+  min score 0), then overrides from the /cluster/quorumd attributes
+  that are present.  Out-of-range numbers are clamped: interval >= 1,
+  tko >= 3, votes >= 0, min_score >= 0, and a negative log level
+  becomes 4.  The device, label and status file strings are stored in
+  ctx as returned by CCS and are not freed here.
+
+  @param cluster_name	Passed through to ccs_force_connect().
+  @param ctx		Context to fill in.
+  @param h		Heuristic array handed to configure_heuristics().
+  @param maxh		Capacity of h.
+  @param cfh		Receives configure_heuristics()'s return value.
+  @param debug		Nonzero: do not apply the configured log level.
+  @return		0 on success, -1 if CCSD could not be reached.
+ */
+int
+get_config_data(char *cluster_name, qd_ctx *ctx, struct h_data *h, int maxh,
+		int *cfh, int debug)
+{
+	int ccsfd, n, loglevel = 4;
+	char *val;
+
+	clulog(LOG_DEBUG, "Loading configuration information\n");
+
+	ccsfd = ccs_force_connect(cluster_name, 1);
+	if (ccsfd < 0) {
+		clulog(LOG_CRIT, "Connection to CCSD failed; cannot start\n");
+		return -1;
+	}
+
+	/* Defaults for anything the configuration leaves out */
+	ctx->qc_interval = 1;
+	ctx->qc_tko = 10;
+	ctx->qc_scoremin = 0;
+
+	/* Log facility */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@log_facility", &val) == 0) {
+		clu_set_facility(val);
+		free(val);
+	}
+
+	/* Log level; a debug run keeps the level chosen by the caller */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@log_level",
+		       &loglevel) == 0) {
+		if (loglevel < 0)
+			loglevel = 4;
+
+		if (!debug)
+			clu_set_loglevel(loglevel);
+	}
+
+	/* Interval.  Clamp on the field itself, as the original did. */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@interval", &n) == 0) {
+		ctx->qc_interval = n;
+		if (ctx->qc_interval < 1)
+			ctx->qc_interval = 1;
+	}
+
+	/* TKO count */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@tko", &n) == 0) {
+		ctx->qc_tko = n;
+		if (ctx->qc_tko < 3)
+			ctx->qc_tko = 3;
+	}
+
+	/* Votes */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@votes", &n) == 0) {
+		ctx->qc_votes = n;
+		if (ctx->qc_votes < 0)
+			ctx->qc_votes = 0;
+	}
+
+	/* Device */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@device", &val) == 0)
+		ctx->qc_device = val;
+
+	/* Label (overrides device) */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@label", &val) == 0)
+		ctx->qc_label = val;
+
+	/* Status file */
+	if (qd_cfg_str(ccsfd, "/cluster/quorumd/@status_file", &val) == 0)
+		ctx->qc_status_file = val;
+
+	/* Minimum score */
+	if (qd_cfg_int(ccsfd, "/cluster/quorumd/@min_score", &n) == 0) {
+		ctx->qc_scoremin = n;
+		if (ctx->qc_scoremin < 0)
+			ctx->qc_scoremin = 0;
+	}
+
+	*cfh = configure_heuristics(ccsfd, h, maxh);
+
+	clulog(LOG_DEBUG,
+	       "Quorum Daemon: %d heuristics, %d interval, %d tko, %d votes\n",
+	       *cfh, ctx->qc_interval, ctx->qc_tko, ctx->qc_votes);
+
+	ccs_disconnect(ccsfd);
+
+	return 0;
+}
+
+
+/**
+  Quorum disk daemon entry point.
+
+  Options: -d forces debug log level, -f stays in the foreground and
+  logs to the console.
+
+  Connects to CMAN, loads configuration from CCSD, resolves the quorum
+  partition (by label if one is configured, otherwise by device),
+  daemonizes unless -f was given, then registers the quorum device and
+  runs quorum_loop() until it returns.
+
+  @return		0 on clean shutdown, -1 on any startup failure.
+ */
+int
+main(int argc, char **argv)
+{
+	cman_node_t me;
+	int cfh, rv;
+	qd_ctx ctx;
+	cman_handle_t ch;
+	node_info_t ni[MAX_NODES_DISK];
+	struct h_data h[10];
+	char debug = 0, foreground = 0;
+	char device[128];
+
+	while ((rv = getopt(argc, argv, "fd")) != EOF) {
+		switch (rv) {
+		case 'd':
+			debug = 1;
+			break;
+		case 'f':
+			foreground = 1;
+			break;	/* explicit; used to fall into default */
+		default:
+			break;
+		}
+	}
+#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
+	ch = cman_admin_init(NULL);
+#else
+	ch = cman_init(NULL);
+#endif
+	if (!ch) {
+		printf("Could not connect to cluster (CMAN not running?)\n");
+		return -1;
+	}
+
+	if (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
+		printf("Could not determine local node ID; cannot start\n");
+		return -1;
+	}
+
+	qd_init(&ctx, ch, me.cn_nodeid);
+
+	signal(SIGINT, int_handler);
+
+	if (debug)
+		clu_set_loglevel(LOG_DEBUG);
+	if (foreground)
+		clu_log_console(1);
+
+	if (get_config_data(NULL, &ctx, h, 10, &cfh, debug) < 0) {
+		clulog_and_print(LOG_CRIT, "Configuration failed\n");
+		return -1;
+	}
+
+	if (ctx.qc_label) {
+		/* A label wins over a configured device path */
+		if (find_partitions("/proc/partitions",
+				    ctx.qc_label, device,
+				    sizeof(device), 0) != 0) {
+			clulog_and_print(LOG_CRIT, "Unable to match label"
+					 " '%s' to any device\n",
+					 ctx.qc_label);
+			return -1;
+		}
+
+		if (ctx.qc_device)
+			free(ctx.qc_device);
+
+		ctx.qc_device = strdup(device);
+		if (!ctx.qc_device) {
+			/* Previously unchecked; NULL went on to the log
+			   call below and to device registration */
+			clulog_and_print(LOG_CRIT, "Out of memory\n");
+			return -1;
+		}
+
+		clulog(LOG_INFO, "Quorum Partition: %s Label: %s\n",
+		       ctx.qc_device, ctx.qc_label);
+	} else if (ctx.qc_device) {
+		if (check_device(ctx.qc_device, NULL, NULL) != 0) {
+			/* Printed as well as logged, like the other
+			   startup failures; we have not daemonized yet */
+			clulog_and_print(LOG_CRIT,
+					 "Specified partition %s does not "
+					 "have a qdisk label\n",
+					 ctx.qc_device);
+			return -1;
+		}
+	}
+
+	if (!foreground && daemon(0,0) < 0) {
+		/* Previously ignored: a failed daemon() left us running
+		   attached to the terminal as if nothing had happened */
+		clulog_and_print(LOG_CRIT, "Could not daemonize\n");
+		return -1;
+	}
+
+	if (quorum_init(&ctx, ni, MAX_NODES_DISK, h, cfh) < 0) {
+		clulog_and_print(LOG_CRIT, "Initialization failed\n");
+		return -1;
+	}
+
+	cman_register_quorum_device(ctx.qc_ch, ctx.qc_device, ctx.qc_votes);
+	/*
+	   XXX this always returns -1 / EBUSY even when it works?!!!
+
+	if ((rv = cman_register_quorum_device(ctx.qc_ch, ctx.qc_device,
+					      ctx.qc_votes)) < 0) {
+		clulog_and_print(LOG_CRIT,
+				 "Could not register %s with CMAN; "
+				 "return = %d; error = %s\n",
+				 ctx.qc_device, rv, strerror(errno));
+		return -1;
+	}
+	*/
+
+	quorum_loop(&ctx, ni, MAX_NODES_DISK);
+	cman_unregister_quorum_device(ctx.qc_ch);
+
+	quorum_logout(&ctx);
+
+	qd_destroy(&ctx);
+
+	return 0;
+
+}
+
/cvs/cluster/cluster/cman/qdisk/mkqdisk.c,v --> standard output
revision 1.3.2.1
--- cluster/cman/qdisk/mkqdisk.c
+++ - 2006-07-21 18:01:40.340826000 +0000
@@ -0,0 +1,93 @@
+/**
+ Copyright Red Hat, Inc. 2006
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+
+ Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+ @file Quorum disk utility
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <disk.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <platform.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+
+/**
+  mkqdisk entry point.
+
+  Usage: mkqdisk -L | -f <label> | -c <device> -l <label>
+
+  -L		List every partition carrying a qdisk header.
+  -f <label>	Search for the partition with the given label; only
+		the exit status reports the outcome.
+  -c/-l		Together, write a new qdisk label to <device> after
+		an interactive confirmation.
+  -h		Print usage.
+
+  @return	The result of find_partitions() or qdisk_init() for the
+		selected action; 0 for -h or a declined confirmation;
+		1 for incomplete arguments.
+ */
+int
+main(int argc, char **argv)
+{
+	char device[128];
+	char *newdev = NULL, *newlabel = NULL;
+	int rv;
+
+	printf("mkqdisk v0.5\n");
+
+	/* "f:" -- -f takes the label as its argument.  Without the colon
+	   getopt never set optarg, so -f searched with a NULL label and
+	   matched whichever qdisk partition came first. */
+	while ((rv = getopt(argc, argv, "Lf:c:l:h")) != EOF) {
+		switch (rv) {
+		case 'L':
+			/* List */
+			close(2);
+			return find_partitions("/proc/partitions",
+					       NULL, NULL, 0, 1);
+		case 'f':
+			close(2);
+			return find_partitions("/proc/partitions",
+					       optarg, device,
+					       sizeof(device), 0);
+		case 'c':
+			newdev = optarg;
+			break;
+		case 'l':
+			newlabel = optarg;
+			break;
+		case 'h':
+			printf("usage: mkqdisk -L | -f <label> | -c "
+			       "<device> -l <label>\n");
+			return 0;
+		default:
+			break;
+		}
+	}
+
+	if (!newdev && !newlabel) {
+		printf("usage: mkqdisk -L | -f <label> | -c "
+		       "<device> -l <label>\n");
+		return 1;
+	}
+
+	if (!newdev || !newlabel) {
+		printf("Both a device and a label are required\n");
+		return 1;
+	}
+
+	printf("Writing new quorum disk label '%s' to %s.\n",
+	       newlabel, newdev);
+	printf("WARNING: About to destroy all data on %s; proceed [N/y] ? ",
+	       newdev);
+	/* The prompt has no trailing newline; flush it before we block
+	   on input so it is visible */
+	fflush(stdout);
+	if (getc(stdin) != 'y') {
+		printf("Good thinking.\n");
+		return 0;
+	}
+
+	return qdisk_init(newdev, newlabel);
+}
/cvs/cluster/cluster/cman/qdisk/platform.h,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/platform.h
+++ - 2006-07-21 18:01:40.435441000 +0000
@@ -0,0 +1,59 @@
+/*
+ Copyright Red Hat, Inc. 2002-2003
+
+ The Red Hat Cluster Manager API Library is free software; you can
+ redistribute it and/or modify it under the terms of the GNU Lesser
+ General Public License as published by the Free Software Foundation;
+ either version 2.1 of the License, or (at your option) any later
+ version.
+
+ The Red Hat Cluster Manager API Library is distributed in the hope
+ that it will be useful, but WITHOUT ANY WARRANTY; without even the
+ implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ PURPOSE. See the GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ USA.
+ */
+/** @file
+ * Defines for byte-swapping
+ */
+#ifndef __PLATFORM_H
+#define __PLATFORM_H
+
+#include <endian.h>
+#include <sys/param.h>
+#include <byteswap.h>
+#include <bits/wordsize.h>
+
+/*
+   le_swapNN(x): byte-swap x on big-endian hosts only.
+   No swapping on little-endian machines.
+ */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define le_swap16(x) (x)
+#define le_swap32(x) (x)
+#define le_swap64(x) (x)
+#else
+#define le_swap16(x) bswap_16(x)
+#define le_swap32(x) bswap_32(x)
+#define le_swap64(x) bswap_64(x)
+#endif
+
+/*
+   be_swapNN(x): byte-swap x on little-endian hosts only.
+   No swapping on big-endian machines.
+ */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define be_swap16(x) bswap_16(x)
+#define be_swap32(x) bswap_32(x)
+#define be_swap64(x) bswap_64(x)
+#else
+#define be_swap16(x) (x)
+#define be_swap32(x) (x)
+#define be_swap64(x) (x)
+#endif
+
+
+/*
+   swabNN(x): apply be_swapNN to the lvalue x in place.
+
+   The expansion is fully parenthesised so the macro behaves as a
+   single expression wherever it appears.  x is evaluated twice, so it
+   must not have side effects.
+ */
+#define swab16(x) ((x) = be_swap16(x))
+#define swab32(x) ((x) = be_swap32(x))
+#define swab64(x) ((x) = be_swap64(x))
+
+
+#endif /* __PLATFORM_H */
/cvs/cluster/cluster/cman/qdisk/proc.c,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/proc.c
+++ - 2006-07-21 18:01:40.521042000 +0000
@@ -0,0 +1,128 @@
+/**
+ Copyright Red Hat, Inc. 2006
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+
+ Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+ @file Quorum disk /proc/partition scanning functions
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <disk.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <platform.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/**
+  Check whether a device carries a quorum disk header, optionally
+  with a specific label.
+
+  @param device	Path of the device to inspect.
+  @param label		Label to match against qh_cluster, or NULL to
+			accept any label.
+  @param qh		Receives the header (after
+			swab_quorum_header_t()), or NULL if the caller
+			does not need it.
+  @return		0 if the header magic matches and the label is
+			acceptable, -1 otherwise (including validate,
+			open or read failures).
+ */
+int
+check_device(char *device, char *label, quorum_header_t *qh)
+{
+	int fd = -1, ret = -1;
+	quorum_header_t qh_local;
+
+	if (!qh)
+		qh = &qh_local;
+
+	/* NOTE(review): the original stored this result in fd and then
+	   overwrote it with qdisk_open(); it is treated here as a plain
+	   status code -- confirm against qdisk_validate(). */
+	if (qdisk_validate(device) < 0) {
+		/* Was labelled "qdisk_verify", which is not the call made */
+		perror("qdisk_validate");
+		return -1;
+	}
+
+	fd = qdisk_open(device);
+	if (fd < 0) {
+		perror("qdisk_open");
+		return -1;
+	}
+
+	if (qdisk_read(fd, OFFSET_HEADER, qh, sizeof(*qh)) == sizeof(*qh)) {
+		swab_quorum_header_t(qh);
+		if (qh->qh_magic == HEADER_MAGIC_NUMBER) {
+			if (!label || !strcmp(qh->qh_cluster, label)) {
+				ret = 0;
+			}
+		}
+	}
+
+	qdisk_close(&fd);
+
+	return ret;
+}
+
+
+/**
+  Scan a /proc/partitions-style file for quorum disk partitions.
+
+  Each parsed line names a device; "/dev/<name>" is handed to
+  check_device() together with label.
+
+  @param partfile	File to scan (normally "/proc/partitions").
+  @param label		Label to look for, or NULL to accept any qdisk.
+  @param devname	If non-NULL (and devlen is nonzero), receives the
+			path of the first match and the scan stops there.
+  @param devlen	Size of devname in bytes.
+  @param print		Nonzero: print header details of every match.
+  @return		0 when a match was stored in devname, or when
+			print is set and the scan finished; -1 if the
+			file could not be opened, or nothing matched
+			(errno = ENOENT).
+ */
+int
+find_partitions(const char *partfile, const char *label,
+		char *devname, size_t devlen, int print)
+{
+	char line[4096];
+	FILE *fp;
+	int minor, major;
+	unsigned long long blkcnt;
+	char device[128];
+	char realdev[256];
+	quorum_header_t qh;
+	time_t created;
+
+	fp = fopen(partfile, "r");
+	if (!fp)
+		return -1;
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		if (strlen(line) > 128 + (22) /* 5 + 5 + 11 + 1 */) {
+			/*printf("Line too long!\n");*/
+			continue;
+		}
+
+		/* This line is taken from 2.6.15.4's proc line.
+
+		   Require all four fields: the header and blank lines do
+		   not parse, and previously left `device' uninitialised
+		   (first lines) or holding the previous line's name.
+		   %127s bounds the name to the buffer; the length check
+		   above alone still allowed a token longer than 127. */
+		if (sscanf(line, "%4d %4d %10llu %127s", &major, &minor,
+			   &blkcnt, device) != 4)
+			continue;
+
+		snprintf(realdev, sizeof(realdev), "/dev/%s", device);
+		if (check_device(realdev, (char *)label, &qh) != 0)
+			continue;
+
+		if (print) {
+			/* Copy into a real time_t instead of casting a
+			   pointer into the header: the field need not
+			   have time_t's size or alignment.
+			   NOTE(review): assumes qh_timestamp is an
+			   integer type -- confirm in disk.h. */
+			created = (time_t)qh.qh_timestamp;
+
+			printf("%s:\n", realdev);
+			printf("\tMagic: %08x\n", qh.qh_magic);
+			printf("\tLabel: %s\n", qh.qh_cluster);
+			printf("\tCreated: %s", ctime(&created));
+			printf("\tHost: %s\n\n", qh.qh_updatehost);
+		}
+
+		if (devname && devlen) {
+			/* Got it */
+			strncpy(devname, realdev, devlen);
+			/* strncpy does not terminate on truncation */
+			devname[devlen - 1] = '\0';
+			fclose(fp);
+			return 0;
+		}
+	}
+
+	fclose(fp);
+
+	if (print)
+		/* No errors if we're just printing stuff */
+		return 0;
+
+	errno = ENOENT;
+	return -1;
+}
/cvs/cluster/cluster/cman/qdisk/score.c,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/score.c
+++ - 2006-07-21 18:01:40.622758000 +0000
@@ -0,0 +1,383 @@
+/**
+ Copyright Red Hat, Inc. 2006
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+
+ Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+ @file Quorum daemon scoring functions + thread.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <string.h>
+#include <ccs.h>
+#include <clulog.h>
+#include "score.h"
+
+/* sc_lock is held while _score and _maxscore are read or written */
+static pthread_mutex_t sc_lock = PTHREAD_MUTEX_INITIALIZER;
+/* Latest totals published by the score thread, plus its run flag.
+   The run flag is read and written without taking sc_lock. */
+static int _score = 0, _maxscore = 0, _score_thread_running = 0;
+/* Handle of the score thread, joined by stop_score_thread() */
+static pthread_t score_thread = (pthread_t)0;
+
+/* Argument handed to the score thread: a heap copy of the heuristic
+   array and its length.  Allocated in start_score_thread(), freed by
+   score_thread_main() on exit. */
+struct h_arg {
+ struct h_data *h;
+ int count;
+};
+
+
+/**
+  Open /dev/null with the given flags and make sure it ends up on
+  descriptor `target'.  Does nothing if the open fails.
+ */
+static void
+reopen_on_devnull(int target, int flags)
+{
+	int fd;
+
+	fd = open("/dev/null", flags);
+	if (fd < 0)
+		return;		/* nothing to duplicate; was dup2(-1, n) */
+
+	if (fd != target) {
+		dup2(fd, target);
+		close(fd);	/* the extra descriptor used to leak */
+	}
+}
+
+
+/**
+  Point stdin, stdout and stderr at /dev/null.
+
+  Descriptor 0 is reopened read-only, 1 and 2 write-only.
+
+  XXX Messy, but works for now...
+ */
+void
+nullify(void)
+{
+	close(0);
+	close(1);
+	close(2);
+
+	reopen_on_devnull(0, O_RDONLY);
+	reopen_on_devnull(1, O_WRONLY);
+	reopen_on_devnull(2, O_WRONLY);
+}
+
+
+/**
+  Spin off a user-defined heuristic.
+
+  Runs h->program through "/bin/sh -c" in a child process whose
+  standard descriptors are redirected by nullify().  Nothing is
+  started if a child is still outstanding or h->nextrun has not been
+  reached; otherwise nextrun is advanced by h->interval.
+
+  @param h		Heuristic to run.
+  @return		0 if a child was started or it is not yet time;
+			-1 if a child is still running
+			(errno = EINPROGRESS) or fork() failed.
+ */
+static int
+fork_heuristic(struct h_data *h)
+{
+	pid_t pid;
+	char *argv[4];
+	time_t now;
+
+	if (h->childpid) {
+		errno = EINPROGRESS;
+		return -1;
+	}
+
+	now = time(NULL);
+	if (now < h->nextrun)
+		return 0;
+
+	h->nextrun = now + h->interval;
+
+	pid = fork();
+	if (pid < 0)
+		return -1;
+
+	if (pid) {
+		/* Parent: remember the child for check_heuristic() */
+		h->childpid = pid;
+		return 0;
+	}
+
+	/* Child */
+	argv[0] = "/bin/sh";
+	argv[1] = "-c";
+	argv[2] = h->program;
+	argv[3] = NULL;
+
+	nullify();
+
+	execv("/bin/sh", argv);
+
+	/* execv only returns on failure.  The child must not return into
+	   the caller: it would carry on as a second copy of the daemon.
+	   A nonzero exit status makes check_heuristic() count the
+	   heuristic as failed.  (stdout is /dev/null by now, so the old
+	   printf here was never visible.) */
+	_exit(127);
+}
+
+
+/**
+  Add up the heuristic scores.
+
+  @param h		Heuristic array.
+  @param max		Number of entries in h.
+  @param score		Receives the sum of scores of the entries whose
+			`available' flag is set.
+  @param maxscore	Receives the sum of all scores.
+ */
+static void
+total_score(struct h_data *h, int max, int *score, int *maxscore)
+{
+	int idx = 0;
+
+	*score = 0;
+	*maxscore = 0;
+
+	while (idx < max) {
+		const struct h_data *cur = &h[idx++];
+
+		*maxscore += cur->score;
+		*score += cur->available ? cur->score : 0;
+	}
+}
+
+
+/**
+  Check for response from a user-defined heuristic / script.
+
+  Reaps the child recorded in h->childpid, if any, and sets
+  h->available to 1 only when it exited normally with status 0.
+
+  @param h		Heuristic to check.
+  @param block		Nonzero: wait for the child to finish.  Zero:
+			poll with WNOHANG.
+  @return		0 if there is no child, it is still running, or
+			it was reaped; -1 if waitpid() failed (the child
+			is then forgotten and the heuristic marked
+			unavailable).
+ */
+static int
+check_heuristic(struct h_data *h, int block)
+{
+	pid_t ret;
+	int status = 0;
+
+	if (h->childpid == 0)
+		return 0;
+
+	/* Retry when a signal interrupts the wait; the old code dropped
+	   the pid in that case and the child was never reaped */
+	do {
+		ret = waitpid(h->childpid, &status, block?0:WNOHANG);
+	} while (ret < 0 && errno == EINTR);
+
+	if (!block && ret == 0)
+		return 0;	/* still running */
+
+	h->childpid = 0;
+	h->available = 0;
+
+	/* On any failure `status' was not filled in.  Only ECHILD used to
+	   be caught; other errors went on to test an uninitialised
+	   status. */
+	if (ret < 0)
+		return -1;
+
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
+		h->available = 1;
+
+	return 0;
+}
+
+
+/**
+  Call fork_heuristic() on each of the first `max' entries of h.
+  Individual results are ignored.
+
+  @return		Always 0.
+ */
+static int
+fork_heuristics(struct h_data *h, int max)
+{
+	int remaining = max;
+
+	while (remaining-- > 0)
+		fork_heuristic(h++);
+
+	return 0;
+}
+
+
+/**
+  Call check_heuristic() on each of the first `max' entries of h,
+  passing `block' through.  Individual results are ignored.
+
+  @return		Always 0.
+ */
+static int
+check_heuristics(struct h_data *h, int max, int block)
+{
+	int remaining = max;
+
+	while (remaining-- > 0)
+		check_heuristic(h++, block);
+
+	return 0;
+}
+
+
+/**
+  Load heuristic definitions from CCS into the caller's array.
+
+  Queries /cluster/quorumd/heuristic[N]/ for N = 1, 2, ... and stops at
+  the first N with no @program attribute or once `max' entries are
+  filled.  Each slot is reset to defaults (score 1, interval 2) before
+  it is queried; a non-positive @score or @interval falls back to the
+  default.  The program string is kept as returned by CCS.
+
+  @param ccsfd		Open CCS descriptor.
+  @param h		Array to fill.
+  @param max		Capacity of h.
+  @return		Number of heuristics loaded, or -1 if h is NULL
+			or max is 0.
+ */
+int
+configure_heuristics(int ccsfd, struct h_data *h, int max)
+{
+	int count = 0;
+	char *val;
+	char query[128];
+	struct h_data *cur;
+
+	if (!h || !max)
+		return -1;
+
+	for (;;) {
+		cur = &h[count];
+
+		/* Defaults; also applied to the first unused slot */
+		cur->program = NULL;
+		cur->available = 0;
+		cur->interval = 2;
+		cur->score = 1;
+		cur->childpid = 0;
+		cur->nextrun = 0;
+
+		/* Program: its absence ends the list */
+		snprintf(query, sizeof(query),
+			 "/cluster/quorumd/heuristic[%d]/@program", count+1);
+		if (ccs_get(ccsfd, query, &val) != 0)
+			break;
+		cur->program = val;
+
+		/* Score */
+		snprintf(query, sizeof(query),
+			 "/cluster/quorumd/heuristic[%d]/@score", count+1);
+		if (ccs_get(ccsfd, query, &val) == 0) {
+			cur->score = atoi(val);
+			free(val);
+			if (cur->score <= 0)
+				cur->score = 1;
+		}
+
+		/* Query interval */
+		snprintf(query, sizeof(query),
+			 "/cluster/quorumd/heuristic[%d]/@interval", count+1);
+		if (ccs_get(ccsfd, query, &val) == 0) {
+			cur->interval = atoi(val);
+			free(val);
+			if (cur->interval <= 0)
+				cur->interval = 2;
+		}
+
+		clulog(LOG_DEBUG, "Heuristic: '%s' score=%d interval=%d\n",
+		       cur->program, cur->score, cur->interval);
+
+		if (++count >= max)
+			break;
+	}
+
+	clulog(LOG_DEBUG, "%d heuristics loaded\n", count);
+
+	return count;
+}
+
+
+/**
+  Hand the most recently published score and maximum score to the
+  caller.  Both values are read under sc_lock.
+
+  @param score		Receives the current score.
+  @param maxscore	Receives the maximum possible score.
+  @return		Always 0.
+ */
+int
+get_my_score(int *score, int *maxscore)
+{
+	int cur, top;
+
+	pthread_mutex_lock(&sc_lock);
+	cur = _score;
+	top = _maxscore;
+	pthread_mutex_unlock(&sc_lock);
+
+	*score = cur;
+	*maxscore = top;
+
+	return 0;
+}
+
+
+/**
+  Body of the scoring thread.
+
+  While the run flag is set: start any heuristics that are due, poll
+  them without blocking, total the scores, publish the totals under
+  sc_lock, and sleep one second.  On exit the heuristic array and the
+  argument block are freed.
+
+  @param arg		struct h_arg * built by start_score_thread().
+  @return		NULL.
+ */
+void *
+score_thread_main(void *arg)
+{
+	struct h_arg *args = (struct h_arg *)arg;
+	struct h_data *heur = args->h;
+	int nheur = args->count;
+	int cur, top;
+
+	while (_score_thread_running) {
+		fork_heuristics(heur, nheur);
+		check_heuristics(heur, nheur, 0);
+		total_score(heur, nheur, &cur, &top);
+
+		/* Publish the new totals for get_my_score() */
+		pthread_mutex_lock(&sc_lock);
+		_score = cur;
+		_maxscore = top;
+		pthread_mutex_unlock(&sc_lock);
+
+		/* Skip the nap if we were told to stop meanwhile */
+		if (_score_thread_running)
+			sleep(1);
+	}
+
+	free(heur);
+	free(args);
+	printf("Score thread going away\n");
+	return (NULL);
+}
+
+
+/**
+  Stop the score thread for shutdown / reconfiguration.
+
+  Clears the run flag and joins the thread.  Does nothing if the
+  thread is not marked as running.
+
+  @return		Always 0.
+ */
+int
+stop_score_thread(void)
+{
+	void *unused;
+
+	if (_score_thread_running) {
+		_score_thread_running = 0;
+		pthread_join(score_thread, &unused);
+	}
+
+	return 0;
+}
+
+
+/**
+  Start the score thread.  h is copied into an argument which is
+  passed in as the arg parameter in the score thread, so it is safe
+  to pass in h if it was allocated on the stack.
+
+  @param h		Heuristic array to copy.
+  @param count		Number of entries in h.
+  @return		0 if the thread was created; -1 if h is NULL,
+			count is not positive, memory could not be
+			allocated, or pthread_create() failed (errno is
+			then set to its error code).
+ */
+int
+start_score_thread(struct h_data *h, int count)
+{
+	pthread_attr_t attrs;
+	struct h_arg *args;
+	int rc;
+
+	/* A negative count used to reach malloc() as a huge size */
+	if (!h || count <= 0)
+		return -1;
+
+	args = malloc(sizeof(struct h_arg));
+	if (!args)
+		return -1;
+
+	args->h = malloc(sizeof(struct h_data) * count);
+	if (!args->h) {
+		free(args);
+		return -1;
+	}
+
+	memcpy(args->h, h, (sizeof(struct h_data) * count));
+	args->count = count;
+
+	/* Set before creation so the new thread enters its loop */
+	_score_thread_running = 1;
+	pthread_attr_init(&attrs);
+	pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
+	rc = pthread_create(&score_thread, &attrs, score_thread_main, args);
+	pthread_attr_destroy(&attrs);
+
+	/* Use pthread_create()'s own result.  The old test looked at the
+	   static score_thread handle, which may still hold an earlier
+	   thread's value and so hide a failure; the argument block was
+	   also leaked on that path. */
+	if (rc != 0) {
+		_score_thread_running = 0;
+		free(args->h);
+		free(args);
+		errno = rc;
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+   Disabled test driver (compiled out by #if 0): loads heuristics from
+   the "test" cluster configuration, runs the score thread, prints the
+   score once a second for ten iterations, stops the thread and prints
+   the final score.
+ */
+#if 0
+int
+main(int argc, char **argv)
+{
+ struct h_data h[10];
+ int max = 0, score, maxscore, ccsfd;
+
+ ccsfd = ccs_force_connect("test", 1);
+ if (ccsfd < 0)
+ return -1;
+ max = configure_heuristics(ccsfd, h, 10);
+ ccs_disconnect(ccsfd);
+
+ start_score_thread(h, max);
+ /* max is reused as the loop counter from here on */
+ max = 0;
+ while (max < 10) {
+ get_my_score(&score,&maxscore);
+ printf("current %d/%d\n", score, maxscore);
+ sleep(1);
+ ++max;
+ }
+ stop_score_thread();
+
+ get_my_score(&score,&maxscore);
+ printf("final! %d/%d\n", score, maxscore);
+
+ return 0;
+}
+#endif
+
/cvs/cluster/cluster/cman/qdisk/score.h,v --> standard output
revision 1.2.2.1
--- cluster/cman/qdisk/score.h
+++ - 2006-07-21 18:01:40.712579000 +0000
@@ -0,0 +1,60 @@
+/**
+ Copyright Red Hat, Inc. 2006
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+
+ Author: Lon Hohberger <lhh at redhat.com>
+ */
+/**
+ @file Quorum daemon scoring functions + thread header file
+ */
+#ifndef _SCORE_H
+#define _SCORE_H
+
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+/*
+   One user-defined heuristic and its run-time state.
+ */
+struct h_data {
+ /* Command line, run via "/bin/sh -c" */
+ char * program;
+ /* Points this heuristic contributes to the total */
+ int score;
+ /* Nonzero if the last completed run exited with status 0 */
+ int available;
+ /* Seconds added to the current time to schedule the next run */
+ int interval;
+ /* PID of the outstanding child, or 0 if none */
+ pid_t childpid;
+ /* Earliest time at which the program is started again */
+ time_t nextrun;
+};
+
+/*
+ Grab score data from CCSD.  Fills at most max entries of hp and
+ returns the number loaded, or -1 if hp is NULL or max is 0.
+ */
+int configure_heuristics(int ccsfd, struct h_data *hp, int max);
+
+/*
+ Stop the thread which runs the scoring applets.
+ Returns 0.
+ */
+int stop_score_thread(void);
+
+/*
+ Start the thread which runs the scoring applets.  The thread works
+ on its own copy of h.  Returns 0 on success, -1 on failure.
+ */
+int start_score_thread(struct h_data *h, int count);
+
+/*
+ Get our score + maxscore as last published by the scoring thread.
+ Returns 0.
+ */
+int get_my_score(int *score, int *maxscore);
+
+#endif
More information about the Cluster-devel
mailing list