/*
  Copyright Mission Critical Linux, 2000

  Kimberlite is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  Kimberlite is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Kimberlite; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge,
  MA 02139, USA.
*/
/*
 *  $Id: quorumd.c,v 1.33 2000/11/20 22:28:04 burke Exp $
 *
 *  Copyright (C) 2000 Mission Critical Linux, LLC
 *
 *  author: Tim Burke <burke@missioncriticallinux.com>
 *  comms stuff: Jeff Moyer <moyer@missioncriticallinux.com>
 *  authentication bits: Jeff Moyer
 *
 * quorumd.c
 *
 * This file implements the quorum daemon functionality which is used to
 * monitor the cluster member's status at the disk level.  It runs
 * periodically and checks the disk for signs of activity.  Upon changes
 * in node status it calls back to the upper layer software so it can
 * respond accordingly.  Also interacts with the power switch daemon
 * to monitor power switch status and shoot failed nodes as an IO barrier.
 */
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/errno.h>
#include <signal.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <assert.h>
#include <libgen.h>
#include <sys/reboot.h>

#include <logger.h>
#include "diskstate.h"
#include "disk_proto.h"
#include <msgsvc.h>
#include <clusterdefs.h>
#include <parseconf.h>
#include <power.h>
#include <clucfg.h>
#include "diskcomms.h"

#define clu_unlock()     clu_un_lock()

/*
 * Operational statistics for the quorum daemon.  A single file-scope
 * instance (quorum_stats) accumulates these counters while the daemon runs.
 */
typedef struct {
	ulong  goodWrites;       /* Successful updates of our node status block */
	ulong  errorWrites;	 /* Failed writes of our node status block */
	ulong  goodReads;        /* Successful status block reads (partner's
				    block, or our own in the hang check) */
	ulong  errorReads;	 /* Failed status block reads */
	time_t maxQuorumdDuration; /* longest time thus far per iteration of
				      quorumd */
	ulong  tooLongQuorumd;   /* Number of times quorum check loop took
				    too long */
	ulong  powerSwTimeout;   /* Unable to contact power switch */
} QuorumStats;

/*
 * Function prototypes.
 *
 * Forward declarations for routines used in this file.  The static ones
 * are private to this translation unit; the non-static ones are callable
 * from other cluster sources.
 */
static int monitor_cluster_daemons(void);
static int update_ondisk_status(void);
static int get_partner_status(void);
static int shoot_partner(void);
static void quorumd_body(void);
static int is_partner_member(void);
static int spawn_daemon(const char *daemon_name);
static int spawn_all_daemons(void);
static int start_quorumd(void);
static int stop_quorumd(int complain);
static void logStateChange(int nodeNum, int state);
static int establishConfigParams(int hup);
static int checkPowerSwitch(void);
static int notify_sm(int nodeNum, int state);
static int notifyMemberConfigChange(void); 
static int verifyTimestampNewer(time_t newtime);
static void updateConfigTimestamp(time_t tstamp);
static void configChangeCheck(time_t configDBTimestamp);
static pid_t get_daemons_pid(char *prog);
void node_status_init_state(NodeStatusBlock *statb);

void consider_reboot(void);
int getNodeState(int nodeNum);
int notifySMStartup(void);
void printQuorumStats(void);
void processConfigChange(void);

/*
 * diskcomms.c
 *
 * NOTE(review): fill_generic_hdr() is listed under this heading but is
 * defined in this file; the remaining routines are presumably implemented
 * in diskcomms.c -- confirm.
 */
void fill_generic_hdr(generic_msg_hdr *hdr, int command, int len);
int process_disk_command(quorumdConnectionSt *cnxp);
int process_message(quorumdConnectionSt *cnxp);
int process_requests(int secs);
void close_open_comms(void);

/* Non-zero makes notify_sm() log each state change it reports to the SM. */
static int 	debug = 1;
/*
 * The following are default values for configurable parameters.
 */
#define MAX_SAME_TIMESTAMP_NETUP   12	// Max same before failure, while 
					// partner is network heartbeating
#define MAX_SAME_TIMESTAMP_NETDOWN   7	// Max same before failure, partner
					// stopped network heartbeat
#define DEFAULT_PING_INTERVAL	2	// Delay in seconds between iterations
#define DEFAULT_POWER_SWITCH_CHECK_INTERVAL	(DEFAULT_PING_INTERVAL * 6)
#define DEFAULT_RECENT_POWER_STATUS 75	// Max safe time without power status
#define DEFAULT_CATEGORY_SCAN_DELAY 30	// Delay between scans of next category

/* RCS revision string; marked unused so the compiler does not warn. */
static const char *version __attribute__ ((unused)) = "$Revision: 1.33 $";

/*
 * .............Configurable Parameters...................
 *
 * The following tuning knobs are intended to allow customization.
 */
/*
 * We tolerate a few IO errors before reacting.
 * This parameter defines how many consecutive errors are needed
 * to declare a true IO failure condition.  It is intended to avoid
 * over-reacting to an intermittent error.  It bounds the retry loops in
 * update_ondisk_status(), get_partner_status() and quorumd_hang_check().
 *
 * Historical note: At one point I had this set to 3.  In this case it
 * took about a minute to go through all the loop iterations and retries
 * (done while a 1 node cluster).  In a 2 node cluster you would have been
 * shot well before the 1 minute timeframe.  So I dropped it back to 1 in
 * the hopes of allowing the system to shut down some services cleanly.
 */
static int max_consecutive_io_errors = MAX_CONSECUTIVE_IO_ERRORS;
/*
 * The main mechanism used to detect if a partner node has died is to
 * read in the timestamp to see if it has changed.  It's probably not
 * prudent to assume that a partner is dead merely after missing only one
 * timestamp update.  Perhaps they hit an IO activity spike or something like
 * that.  This parameter defines the number of consecutive polls that the
 * timestamp is allowed to remain unchanged before writing off the
 * partner cluster node.  If the heartbeat daemon believes the other node to
 * be up, we will give it longer to update its disk timestamp.  This allows
 * us to have quicker failover times for the case where a node is truly down;
 * while allowing us to not prematurely suspect a node is down due to an
 * IO activity spike.
 */
static int max_same_timestamp_netup = MAX_SAME_TIMESTAMP_NETUP;
static int max_same_timestamp_netdown = MAX_SAME_TIMESTAMP_NETDOWN;

/*
 * This parameter specifies the number of seconds to delay between
 * each iteration of the quorumd process.  It controls how often the
 * interval timer is updated on disk, as well as how often the
 * partner node's interval timer is read in.
 */
static int quorumd_interval = DEFAULT_PING_INTERVAL;
/*
 * This parameter specifies the maximum number of seconds after the last
 * successful communication with the power switch that we are willing to
 * tolerate when taking over services in a node failure shootdown.
 */
static int recentPowerSwitchConnectivity = DEFAULT_RECENT_POWER_STATUS;
/*
 * This parameter specifies the number of seconds between scans of
 * a category of state information on the shared state partition.  This
 * controls the read frequency which indirectly results in a repair operation
 * should the partition become corrupted due to user error.
 */
static int categoryScanDelay = DEFAULT_CATEGORY_SCAN_DELAY;
 
/*......... end of configurable parameters ................*/

/*
 * External functions not in header files.
 * They are declared here by hand, so these prototypes must be kept in
 * step with the definitions manually.
 */
extern void daemon_init(char *prog);         
extern int  check_pid_valid(pid_t pid, char *prog);
extern int quorumd_registered(void);
extern int quorumd_regcheck(time_t);

/*
 * Globals
 */
/*
 * Incarnation number of a partner we successfully shot; quorumd_body()
 * shuts this node down if that incarnation shows up on disk again.
 */
static time_t            evil_incarnation_num = (time_t)0;
/* Timestamp taken from the most recent authenticated powerd status reply. */
static time_t            lastPowerSwitchStatus = 0;
static QuorumStats       quorum_stats;
/* Our own status block, as written to disk by update_ondisk_status(). */
static NodeStatusBlock  *myStatus = NULL;
/* Partner's status block, as last read by get_partner_status(). */
static NodeStatusBlock  *partner_status = NULL;
/* Scratch copy of our own on-disk block, read by quorumd_hang_check(). */
static NodeStatusBlock  *myReadStatus = NULL;
static int 	         myNodenum = -1;
static int 	         partnerNodenum = -1;
static off_t	         offsetOfMyStatusBlock;
static off_t	         offsetOfPartnerStatusBlock;
/* Reference point for the daemon startup grace period in daemon_alive(). */
static time_t            myStartupTimestamp = 0;
static time_t 		 partnerConfigDBTimestamp = 0;
/* Partner state last successfully reported to the Service Manager. */
static int               prior_partner_state = NODE_DOWN;
/* Partner state as currently judged by quorumd_body(). */
static int               partner_state = NODE_DOWN;
/* SWT_NONE enables the hang check at the top of each quorumd_body() pass. */
static int		 power_switch_type = SWT_UNSPECIFIED;

/*
 * Used by diskcomms.c
 */
time_t            last_powerd_contact = (time_t)0;
time_t            last_sm_contact = (time_t)0;
pid_t             pid_quorumd = (pid_t)0;
pid_t             sm_daemon_pid = (pid_t)0;
pid_t		  sm_daemon_fork_pid = (pid_t)0;
pid_t             powerd_daemon_pid = (pid_t)0;
pid_t             hb_daemon_pid = (pid_t)0;
msg_handle_t      listenFD = -1;

extern int clu_msg_based_shoot;


#define MAX_CONNECT_BACKLOG 10
quorumdConnectionSt     *cnx_list = NULL;

#define FIVE_MINUTES (5 * 60)
#define TWO_MINUTES (2 * 60)
/* Seconds after startup before a never-heard-from daemon is declared down. */
static time_t max_daemon_startup_time = (time_t)FIVE_MINUTES;
/* Seconds of silence after which a previously heard daemon is declared down. */
static time_t max_daemon_ping_time = (time_t)TWO_MINUTES;

#define HEARTBEAT_DAEMON  "hb"
#define POWER_DAEMON      "powerd"
#define SVCMGR_DAEMON     "svcmgr"

/*
 * Called if the SM stops sending us periodic ALIVE messages.
 *
 * When the partner is known to be up, the SM is sent SIGBUS (in the hope
 * that it drops a core file for debugging) and this node is then removed
 * from the cluster via shut_myself_down().  When the partner is not up, or
 * its state cannot be determined, nothing is done beyond logging so that
 * availability is not reduced further.
 */
void
consider_reboot(void)
{
	int partnersState;

	partnersState = getNodeState(partnerNodenum);
	if (partnersState < 0) {
		clulog(LOG_ERR, "consider_reboot: Unable to determine"
		       " partner's state.\n");
		return;
	}
	if (partnersState != NODE_UP) {
		clulog(LOG_ERR, "consider_reboot: Partner is NOT up, "
		       "defer reboot.\n");
		return;
	}

	clulog(LOG_ERR,"consider_reboot: Partner is up, so reboot.\n");
	/*
	 * In an attempt to glean some debugging information, send SM
	 * a signal that should cause it to drop a core file.
	 *
	 * Only do so when we actually know the SM's pid: sm_daemon_pid
	 * starts out as 0, and kill(0, sig) would deliver the signal to
	 * every process in our own process group (including this daemon)
	 * instead of to the SM.
	 */
	if (sm_daemon_pid > (pid_t)0) {
		clulog(LOG_ERR, "consider_reboot: Sending SIGBUS to SM.\n");
		kill(sm_daemon_pid, SIGBUS);
	}
	shut_myself_down("Cluster Instability: Service Manager "
			 "not running.");
	/* NOTREACHED */
	return;
}


/*
 * Decide whether a daemon has been silent for long enough to be declared
 * dead.
 * Args: last_contact - timestamp of the last successful contact with the
 *                      daemon, or 0 if it has never been heard from.
 *       who          - daemon name, used only in the log message.
 * Returns: 1 - the daemon is considered healthy
 *          0 - no word from the daemon for too long; declare it failed.
 */
static int
daemon_alive(time_t last_contact, char *who)
{
	time_t now = time(NULL);

	if (last_contact != (time_t)0) {
		/* Heard from it before: judge by the ping timeout. */
		return ((now - last_contact) > max_daemon_ping_time) ? 0 : 1;
	}

	/*
	 * Never heard from this daemon.  Allow it a startup grace period,
	 * measured from our own startup, before getting upset.
	 */
	if ((now - myStartupTimestamp) > max_daemon_startup_time) {
		clulog(LOG_DEBUG, "daemon_alive: no contact with %s.\n"
		       , who);
		return 0;
	}

	return 1;
}


/*
 * Check whether a crucial cluster daemon has gone quiet.  "Crucial" refers
 * to any daemon required to failover/start/stop services.
 * Returns: 0 - the Service Manager is considered alive
 *         -1 - the Service Manager is considered down
 * A silent power daemon is logged but does not affect the return value.
 */
static int
monitor_cluster_daemons()
{
	int status = 0;

	if (!daemon_alive(last_sm_contact, "SM")) {
		clulog(LOG_ERR, "monitor_cluster_daemons: SM is down!\n");

		/* If we know its pid, say whether the process still exists. */
		if (sm_daemon_pid != 0) {
			if (check_pid_valid(sm_daemon_pid,
					    SM_COMMAND_NAME) == 1) {
				clulog(LOG_ERR, "monitor_cluster_daemons: SM"
				       " non-responsive.\n");
			} else {
				clulog(LOG_ERR, "monitor_cluster_daemons: SM "
				       "non-existent.\n");
			}
		}
		status = -1;
	}

	if (!daemon_alive(last_powerd_contact, "Powerd")) {
		clulog(LOG_ERR, "monitor_cluster_daemons: Powerd is down, "
		       "unable to failover services!\n");
	}

	return status;
}


/*
 * Log, at debug level, the node state change that is about to be reported
 * to the Service Manager.  A node number that is neither ours nor the
 * partner's is reported as an error, and the message is then logged
 * without a subject.
 */
static void
logStateChange(int nodeNum, int state)
{
	char msg[256];
	const char *subject = NULL;
	const char *verdict;

	if (nodeNum == myNodenum) {
		subject = "I am ";
	} else if (nodeNum == partnerNodenum) {
		subject = "partner is ";
	}
	verdict = (state == NODE_DOWN) ? "DOWN.\n" : "UP.\n";

	strcpy(msg, "Notify SM: ");
	if (subject != NULL) {
		strcat(msg, subject);
	} else {
		clulog(LOG_ERR, "logStateChange: bogus nodeNum %d\n",nodeNum);
	}
	strcat(msg, verdict);

	clulog(LOG_DEBUG, "%s", msg);
}


/*
 * Call over to the Service Manager to notify it that a node transition
 * has occurred.
 * Args: nodeNum - node whose state changed
 *       state   - the node's new state
 * Returns: 0 - notification successfully delivered
 *         -1 - notification failure (could not connect, or the send did
 *              not transfer the whole message)
 */
static int
notify_sm(int nodeNum, int state)
{
	int retval;
	int status = 0;
	int smFD = -1; /* File descriptor used to communicate with SM */
	DiskMessageSt sendMsgBuf;

	if (debug) {
		logStateChange(nodeNum, state);
	}

	smFD = msg_open(PROCID_SVCMGR, myNodenum);
	if (smFD < 0) {
		clulog(LOG_CRIT, "notify_sm: unable to connect to SM.\n");
		return(-1);
	}

	/* Don't ship uninitialized stack contents to the peer. */
	memset(&sendMsgBuf, 0, sizeof(sendMsgBuf));
	fill_generic_hdr(&sendMsgBuf.hdr, DISK_NODE_STATECHANGE, 
			 DISK_MESSAGE_SIZE);
	sendMsgBuf.data.statusMsg.nodeNumber = nodeNum;
	sendMsgBuf.data.statusMsg.nodeStatus = state;
	retval = msg_send(smFD, &sendMsgBuf, DISK_MESSAGE_SIZE);
	if (retval != DISK_MESSAGE_SIZE){
		clulog(LOG_CRIT, "notify_sm: msg_send to SM failed"
		       ", %d.\n", retval);
		/*
		 * Report the failure.  quorumd_body() only records the new
		 * partner state once notification succeeds, so returning
		 * success here would silently drop the state change.
		 */
		status = -1;
	}

	msg_close(smFD);
	return(status);
}


/*
 * Ask the power daemon whether the power switch is operational, and record
 * when the switch was last successfully contacted.
 * Returns: the status field of powerd's reply, documented as:
 *	      0 - success
 *	      EIO - switch reports error status
 *	      ETIMEDOUT - unable to contact switch, true status unknown
 *	    ECONNREFUSED - communication failure to powerd (open, send or
 *	      receive failed)
 *	    -1 - a reply arrived but was not authenticated
 * Side Effects: whenever an authenticated reply is received, the global
 * lastPowerSwitchStatus is set to the timestamp carried in that reply,
 * whatever status the reply reports.
 */
static int
checkPowerSwitch(void)
{
	generic_msg_hdr   query;
	PswitchStatusMsg  reply;
	msg_handle_t      powerdFD = -1;
	int               sent, received;
	int               authorized = 0;

	memset(&query, 0, sizeof(query));
	memset(&reply, 0, sizeof(reply));

	/*
	 * Build the query header and ship it to powerd.
	 */
	fill_generic_hdr(&query, PSWITCH_QUERY, 0);
	powerdFD = msg_open(PROCID_POWERD, myNodenum);
	if (powerdFD < 0) {
		clulog(LOG_ERR, "checkPowerSwitch: can't open connection"
		       " to powerd.\n");
		return ECONNREFUSED;
	}
	sent = msg_send(powerdFD, &query, sizeof(generic_msg_hdr));
	if (sent < (int)sizeof(generic_msg_hdr)) {
		msg_close(powerdFD);
		clulog(LOG_ERR, "checkPowerSwitch: can't send "
		       "message to powerd.\n");
		return ECONNREFUSED;
	}

	/*
	 * Wait (up to 5, presumably seconds) for the reply; the connection
	 * is not needed after that.
	 */
	received = msg_receive_timeout(powerdFD, &reply, sizeof(reply),
				       &authorized, 5);
	msg_close(powerdFD);

	if (received <= 0) {
		clulog(LOG_ERR, "checkPowerSwitch: can't get reply"
		       " from powerd.\n");
		return ECONNREFUSED;
	}
	if (!authorized) {
		clulog(LOG_CRIT, "checkPowerSwitch: Unauthorized response "
		       "received from supposed power daemon.  This is "
		       "probably a bug.\n");
		return -1;
	}

	lastPowerSwitchStatus = reply.timestamp;
	return reply.status;
}

/*
 * Called when the Service Manager informs us that it's up.  We reply by
 * telling SM who's up and who's down: first our own state, then the
 * partner's.  The state of both nodes must be determined before replying
 * in order to initialize the synchronization locks properly before they
 * are used.
 * Returns: 0 if both notifications were delivered, otherwise the non-zero
 * result of the notification that failed.
 */
int
notifySMStartup(void)
{
	int partnerState = getNodeState(partnerNodenum);
	int rc;

	rc = notify_sm(myNodenum, getNodeState(myNodenum));
	if (rc != 0) {
		clulog(LOG_ERR, "notifySMStartup: unable to tell SM "
		       "we're up.\n");
		return rc;
	}

	rc = notify_sm(partnerNodenum, partnerState);
	if (rc == 0) {
		clulog(LOG_DEBUG, "notifySMStartup: provided node status to SM.\n");
	} else {
		clulog(LOG_ERR, "notifySMStartup: unable to tell SM "
		       "partner's status.\n");
	}

	return rc;
}



/*
 * Populate a generic message header: stamp it with the generic header
 * magic and record the given command code and length.
 */
void
fill_generic_hdr(generic_msg_hdr *hdr, int command, int len)
{
	hdr->command = command;
	hdr->length = len;
	hdr->magic = GENERIC_HDR_MAGIC;
}


/*
 * Call over to the heartbeat daemon to see if it considers the partner
 * to be up, and translate the answer into the number of unchanged disk
 * timestamps the caller should tolerate.
 * Returns: max_same_timestamp_netdown - heartbeat reports the partner down
 *          max_same_timestamp_netup   - heartbeat reports the partner up,
 *              or heartbeat could not be consulted (open/send/receive
 *              failure, timeout, or unauthenticated reply); in those cases
 *              the partner gets the benefit of the doubt.
 */
static int
check_hb_active(void)
{
	int retval, auth=0;
	int query=HB_QUERY_NETUP;
	ssize_t reply;
	msg_handle_t heartbeatFD=-1;
	int hbStatus=0;

	clulog(LOG_DEBUG, "partnerHeartbeatActive: checking in "
	                  "with heartbeat.\n");
	heartbeatFD = msg_open(PROCID_HEARTBEAT, myNodenum);
	if (heartbeatFD < 0) {
		clulog(LOG_ERR, "partnerHeartbeatActive: unable to "
		       "open connection to heartbeat daemon.  "
		       "Error: %s\n", strerror(errno));
		return max_same_timestamp_netup; /* benefit of the doubt */
	}
	retval = msg_send(heartbeatFD, &query, sizeof(query));
	if (retval != (int)sizeof(query)){
		msg_close(heartbeatFD);
		clulog(LOG_ERR, "partnerHeartbeatActive: send failed, %d.\n", 
		       retval);
		return max_same_timestamp_netup; /* benefit of the doubt */
	}
	reply = msg_receive_timeout(heartbeatFD, &hbStatus, 
				    sizeof(hbStatus), &auth, 5);
	msg_close(heartbeatFD);

	/*
	 * Look at the receive result before the authentication flag.  The
	 * flag starts out as 0, so testing it first would report a plain
	 * receive error or timeout as an unauthorized response.
	 */
	if (reply < 0) {
		clulog(LOG_ERR, "partnerHeartbeatActive: Error in receive.\n");
		return max_same_timestamp_netup; /* benefit of the doubt */
	} else if (reply == 0) {
		clulog(LOG_ERR, "partnerHeartbeatActive: Receive timed out."
		       "  Odd.\n");
		return max_same_timestamp_netup; /* benefit of the doubt */
	}
	if (!auth) {
		clulog(LOG_CRIT, "partnerHeartbeatActive: Unauthorized "
		       "response from supposed hb daemon.  This should not "
		       "happen.\n");
		return max_same_timestamp_netup; /* benefit of the doubt */
	}

	/*
	 * Look at bitmask to see if partner is listed as up, 
	 * return accordingly.  Any non-zero value counts as up.
	 */
	clulog(LOG_DEBUG, "partnerHeartbeatActive: HB returned bitmask of "
	       "0x%x\n", hbStatus);
	if (hbStatus == 0) {
		retval = max_same_timestamp_netdown;
		clulog(LOG_INFO, "partnerHeartbeatActive: HB says "
		       "partner is DOWN.\n");
	} else {
		retval = max_same_timestamp_netup;
		clulog(LOG_INFO, "partnerHeartbeatActive: HB says "
		       "partner is UP.\n");
	}

	return(retval);
}


/*
 * Stamp our status block with the current time and write it to disk,
 * making up to max_consecutive_io_errors attempts.  The timestamp is
 * refreshed after each failed attempt.
 * Returns: 0 - block written
 *         -1 - every attempt failed
 * Caller must hold the cluster lock.
 */
static int
update_ondisk_status(void)
{
	int attempt;

	myStatus->timestamp = time(NULL);

	for (attempt = 0; attempt < max_consecutive_io_errors; attempt++) {
		if (writeStatusBlock(offsetOfMyStatusBlock,myStatus,1) == 0) {
			quorum_stats.goodWrites++;
			return 0;
		}

		clulog(LOG_ERR, "update_ondisk_status: unable to "
		       "update on-disk status block.\n");
		quorum_stats.errorWrites++;
		myStatus->timestamp = time(NULL);
	}

	/*
	 * Out of attempts: the disk could not be written.
	 */
	clulog(LOG_EMERG, "update_ondisk_status: unable to write to "
	       "quorum partition.\n");

	return -1;
}

/*
 * Read the partner's status block from disk into partner_status, making
 * up to max_consecutive_io_errors attempts.
 * Returns: 0 - block read
 *         -1 - every attempt failed
 */
static int
get_partner_status(void)
{
	int attempt;

	for (attempt = 0; attempt < max_consecutive_io_errors; attempt++) {
		if (readStatusBlock(offsetOfPartnerStatusBlock,
				    partner_status, 1) == 0) {
			quorum_stats.goodReads++;
			return 0;
		}

		quorum_stats.errorReads++;
		clulog(LOG_ERR, "get_partner_status: Failed to read "
		       "partner's status block.\n");
	}

	return -1;
}

/*
 * This dreaded routine is called when we want to shoot the partner node
 * by toggling its power switch.  The power switch status is queried first
 * (checkPowerSwitch) and the power cycle is only attempted when the switch
 * reports success.  When the switch cannot be contacted but was contacted
 * recently, the partner's state is cleared instead and the shootdown is
 * treated as successful.  A successful return lets the caller mark the
 * partner down so that the Service Manager picks up the services formerly
 * running on the node we just shot.
 *
 * Returns: 0 - the partner is safely out of the picture and it's now safe
 *		for the service manager to takeover services.
 *	   -1 - shootdown failed, this will likely result in service
 *		disruption as they can't be successfully taken over.
 */
static int
shoot_partner(void)
{
	int retval;
	time_t timeNow;

	clulog(LOG_WARNING, "shoot_partner: attempting to shoot partner.\n");

	/*
	 * First check to see if the power switch is operational.
	 */

	retval = checkPowerSwitch();
	switch (retval) {
	case 0: // Success
		retval = clu_powerCyclePartner();
		if (retval != 0) {
			if (retval == -2) {
				clulog(LOG_ERR, "shoot_partner: comm error "
				       "to powerd.\n");
			} else {
				/*
				 * Here we just got a successful power switch 
				 * status, but then got a failure to power 
				 * cycle the partner.  This is an unstable 
				 * situation that we can't safely continue to 
				 * operate under.  Remove ourself from the 
				 * cluster.
				 */
				shut_myself_down("Cluster Instability: power "
						 "switch malfunction.");
				/* NOTREACHED */
			}
			return(-1);
		}
		/* NOTE(review): success is logged at LOG_ERR severity. */
		clulog(LOG_ERR, "shoot_partner: successfully shot partner.\n");
		break;
	case EIO:
		/*
		 * We have connectivity to the power switch but it is reporting
		 * error status.  So we can presume that the partner node is 
		 * probably powered on but we can't shoot him.  In this case
		 * we can't safely takeover any services.
		 * retval stays non-zero, so -1 is returned below.
		 */
		clulog(LOG_EMERG, "shoot_partner: switch in error state, can't"
		       " takeover services.\n");
		break;
	case ECONNREFUSED:
		/*
		 * Unable to exchange message with powerd to ascertain switch's
		 * status.  If you can't even ask powerd for status; it's a 
		 * sure bet you can't send it a command to actually do the 
		 * power cycle.
		 */
		clulog(LOG_EMERG, "shoot_partner: Unable to communicate "
		       "with powerd.\n");
		break;
	case ETIMEDOUT:
		/*
		 * We can't talk to the power switch. This could mean 2 things:
		 * Case1: The power switch's power cord got pulled or otherwise
		 *        lost power.  Consequently the partner node can be 
		 *        presumed to be powered off also.  In this case we 
		 *        can safely takeover services.
		 * Case2: Our serial cable to the partner node has been 
		 *        disconnected.  In this case the partner could still 
		 *        have power.  In an attempt to distinguish these 2 
		 *        cases, we look to see when we last had contact with 
		 *        the power switch.  If we have successfully retrieved
		 *        status from the power switch "recently" then we will
		 *        assume that it's Case1 and not Case2.  While this does
		 *        open up a vulnerability risk, the alternative would 
		 *        be to not takeover any services under the Case1 
		 *        scenario - which is of much higher probability than 
		 *        Case2 when we've had recent switch communication.
		 */
		timeNow = time(NULL);
		if ((timeNow - lastPowerSwitchStatus) < 
		                           recentPowerSwitchConnectivity) {
			clulog(LOG_CRIT, "shoot_partner: can't contact switch, but did recently, takeover services.\n");

			if(clu_clear_partner_state()) {
				clulog(LOG_CRIT, "shoot_partner: bad ret from clu_clear_partner_state\n");
				shut_myself_down("shoot_partner: bad ret from clu_clear_partner_state\n");
			} 
			/* Treat the partner as safely out of the picture. */
			retval = 0;
		} else {
			clulog(LOG_EMERG, "shoot_partner: can't contact switch,"
			       " don't takeover services.\n"); 
		}
		break;
	default:
		/*
		 * Also reached when checkPowerSwitch() returns -1 for an
		 * unauthenticated reply.
		 */
		clulog(LOG_ERR, "shoot_partner: logic error, unknown status, "
		       "%d.\n", retval); 
		break;
	}

	if (retval == 0) {
		clulog(LOG_NOTICE, "shoot_partner: successful "
		       "partner shootdown.\n"); 
		return 0;
	}
	return -1;
}
/*
 * Signal handler for term signal.  It's not a good idea to just exit
 * quorumd as that would leave the shared state marked as up; leaving the
 * node vulnerable to a shootdown.  Here we initiate a complete cluster
 * shutdown by sending a host down signal to service manager.  This will
 * cause SM to stop all services and send a message back to quorumd upon
 * completion.  After receiving that, quorumd will complete its own clean
 * shutdown.
 *
 * NOTE(review): this handler logs and performs message I/O from signal
 * context; confirm those calls are safe to make from a signal handler.
 */
static void
quorumd_sigterm_handler(int __attribute__ ((unused)) arg)
{
	clulog(LOG_WARNING, "quorumd_sigterm_handler: initiate cluster stop.\n");
	if (notify_sm(myNodenum, NODE_DOWN) != 0) {
		clulog(LOG_ERR, "quorumd_sigterm_handler: failed to notify SM "
		       "we're down.\n");
	}
	/* Re-install the handler so a later SIGTERM is handled the same way. */
	signal(SIGTERM, quorumd_sigterm_handler);
	return;
}
/*
 * Signal handler for hup signal. This causes a re-read of the configuration
 * parameters.
 *
 * NOTE(review): the configuration table is destroyed and re-established
 * directly in signal context; confirm this cannot race with code that was
 * interrupted while using the configuration.
 */
static void
quorumd_sighup_handler(int __attribute__ ((unused)) arg)
{
	// Delete prior memory-resident config table to force re-read.
	CFG_Destroy();
	establishConfigParams(1);
	// Re-install the handler so a later SIGHUP is handled the same way.
	signal(SIGHUP, quorumd_sighup_handler);
	return;
}
/*
 * Signal handler for usr1 signal. Prints runtime statistics to log
 * via printQuorumStats().
 */
static void
quorumd_sigusr1_handler(int __attribute__ ((unused)) arg)
{
	printQuorumStats();
	/* Re-install the handler so a later SIGUSR1 is handled the same way. */
	signal(SIGUSR1, quorumd_sigusr1_handler);
	return;
}

/*
 * External interface into the quorum daemon used to ask it to return a node's
 * status.  Called by Service Manager.
 * Args: nodeNum - node being asked about.  Our own node number always
 *                 yields NODE_UP; any other accepted number is answered
 *                 with the partner's state.
 * Returns: a node state as defined in the cluster header file. -1 on error
 * (node number out of range, or no partner status has been set up yet).
 */
int
getNodeState(int nodeNum)
{
	/*
	 * NOTE(review): the upper bound accepts nodeNum == MAX_NODES; if node
	 * numbers run from 0 to MAX_NODES-1 this should be ">=" -- confirm
	 * against the definition of MAX_NODES.
	 */
	if ((nodeNum < 0) || (nodeNum > MAX_NODES)) {
		clulog(LOG_ERR, "getNodeState: Invalid node number %d.\n", 
		       nodeNum);
		return(-1);
	}
	/*
	 * This routine is only called with our node number after we've
	 * determined node membership, and have joined the cluster.
	 */
	if (nodeNum == myNodenum) {
		return(NODE_UP);
	}
	/*
	 * Now we need to return the partner's status.  Here it just returns
	 * the latest state as of the most recent disk pinging.  It doesn't
	 * perform an explicit polling operation because we don't want to
	 * get in the way of the quorumd's time constraints.
	 *
	 * partner_status starts out NULL, so refuse to answer rather than
	 * dereference it if a request arrives before it has been set up.
	 */
	if (partner_status == NULL) {
		clulog(LOG_ERR, "getNodeState: partner status not yet "
		       "available.\n");
		return(-1);
	}
	return(partner_status->state);
}


#define DEVRANDOM     "/dev/random"
#define DEVURANDOM    "/dev/urandom"

/*
 * Fill a buffer with random bytes taken from the kernel random device.
 * Args: buf - destination buffer
 *       sz  - number of bytes to fill; every one of them is written
 * Returns: 0 - buffer filled
 *         -1 - bad arguments, no random device could be opened, or the
 *              device could not supply sz bytes
 *
 * The bytes are read straight from the device.  The previous version
 * seeded random() from the device, but stored read()'s return value (the
 * byte count) over the byte it had just read, so the generator was always
 * seeded with the same value and produced the same sequence on every call.
 * DEVURANDOM is tried first because it does not block; DEVRANDOM is the
 * fallback.
 */
static int
gen_random(void *buf, ssize_t sz)
{
	unsigned char *out = buf;
	ssize_t filled = 0;
	ssize_t got;
	int fd;

	if (buf == NULL || sz < 0) {
		return -1;
	}

	fd = open(DEVURANDOM, O_RDONLY);
	if (fd < 0) {
		fd = open(DEVRANDOM, O_RDONLY);
	}
	if (fd < 0) {
		return -1;
	}

	while (filled < sz) {
		got = read(fd, out + filled, (size_t)(sz - filled));
		if (got < 0 && errno == EINTR) {
			continue;	/* interrupted before any data; retry */
		}
		if (got <= 0) {
			/* Read error or unexpected end of file. */
			close(fd);
			return -1;
		}
		filled += got;
	}

	close(fd);
	return 0;
}

/*
 * Only called when running in a configuration which does not include
 * remote power switches.  When power switches are used, if a node hangs
 * it will get shot as an I/O Barrier.  Without a power switch you risk
 * data corruption in the event that a node hangs and then resumes after 
 * the other system has taken over the services.
 *
 * So to try to minimize this window of vulnerability where the unhung system
 * resumes, perform a check to see if the other system has done a hostile
 * takeover of services.  The way to detect this is by examining this node's
 * on-disk state.  The only time that would ever be in the DOWN state while
 * the quorumd's main polling loop is running would be if the other system
 * has marked it as such.  In this case if the configuration did have
 * power switches a shootdown would have occurred.  If a system sees its
 * state as down, shutdown as quickly as possible to close down the window
 * where the same filesystem/partition is concurrently mounted by 2 cluster
 * members.  A normal clean shutdown would take too long; resulting in
 * filesystem buffer cache drains, etc.  Avoid all that vulnerability by
 * restarting the system immediately with reboot(RB_AUTOBOOT) (do not pass
 * Go, do not collect $200).
 *
 * If our status block cannot be read at all, no decision is taken: the
 * contents of the read buffer would not reflect what is on disk.
 */
static void
quorumd_hang_check(void) {
	int retries = 0;
	int read_ok = 0;

	if (myStatus->state != NODE_UP) {
		clulog(LOG_ERR, "quorumd_hang_check: Called while I'm not up!\n");
		return;
	}

	while (retries++ < max_consecutive_io_errors) {
		if (readStatusBlock(offsetOfMyStatusBlock, 
				    myReadStatus, 1) != 0) {
			quorum_stats.errorReads++;
			clulog(LOG_ERR, "quorumd_hang_check: Failed to read "
			       "my status block.\n"); // we got bigger problems
		} else {
			quorum_stats.goodReads++;
			read_ok = 1;
			break;
		}
	}
	if (!read_ok) {
		/*
		 * Every read failed (already logged above).  Don't judge a
		 * buffer that was not refreshed from disk; a stale or
		 * never-filled state could trigger a needless reboot.
		 */
		return;
	}
	if (myReadStatus->state == NODE_DOWN) {
		clulog(LOG_CRIT, "quorumd_hang_check: Halting due to unclean "
				"service takeover!\n");
		reboot(RB_AUTOBOOT);
	}
	return;
}

/*
 * Main polling loop of the quorum daemon; does not return normally.
 *
 * Each iteration:
 *   - runs the hang check when no power switch is configured,
 *   - writes our timestamp and reads the partner's status block
 *     (shutting this node down if either fails),
 *   - shuts down if a partner incarnation we previously shot reappears,
 *   - tracks whether the partner's timestamp is advancing and shoots the
 *     partner once it has been unchanged for too long,
 *   - reports partner state changes to the Service Manager,
 *   - checks on the other cluster daemons,
 *   - spends what is left of quorumd_interval scanning the next state
 *     category and servicing requests.
 * When process_requests() returns 0 the loop starts trying to stop the
 * daemon, and exits the process once stop_quorumd() succeeds.
 */
static void
quorumd_body(void)
{
	int        timestamp_unchanged = 0,
		   changed_times = 0,
		   max_same_timestamp = max_same_timestamp_netup;
	int        ret = -1;
	time_t     start_time, elapsed_time, last_partner_timestamp = 0;
	/* Cleared once a stop has been requested; stays cleared afterwards. */
	static     int proceed = 1;

	while (1) {
		start_time = time(NULL);

		if (power_switch_type == SWT_NONE) {
			quorumd_hang_check();
		}

		/*
		 * The following two calls can only fail due to I/O errors.
		 */
		if (update_ondisk_status() < 0) {
			shut_myself_down("Unable to update timestamp on "
					 "shared storage partition.\n");
		}
	
		if (get_partner_status() < 0) {
			shut_myself_down("Unable to read partner's timestamp"
					 " from shared storage partition.\n");
		}

		/*
		 * Sanity Check...
		 * A status block carrying the incarnation number of a node
		 * we shot means that node is still writing to the disk.
		 */
		if (evil_incarnation_num && 
		        (partner_status->incarnationNumber == evil_incarnation_num)) {
			clulog(LOG_EMERG, "quorumd_body: resurrected node has "
			       "initiated I/O!\n");
			shut_myself_down("Cluster Instability: shot node "
					 "resurrected\n");
			/* NOTREACHED */
			return;
		}

		if (partner_status->state == NODE_UP) {
			clulog(LOG_DEBUG, "quorumd_body: last_time: 0x%lx, "
			       "this_time: 0x%lx\n", last_partner_timestamp,
			       partner_status->timestamp);

			if (last_partner_timestamp == partner_status->timestamp) {
				/*
				 * No progress since the last poll.  Once the
				 * count reaches the network-down threshold,
				 * ask heartbeat (on every further poll) which
				 * threshold applies.
				 */
				timestamp_unchanged++;
				if (timestamp_unchanged >= 
				    max_same_timestamp_netdown) {
					max_same_timestamp = check_hb_active();
				}
			} else {
				timestamp_unchanged = 0;
				last_partner_timestamp = 
					            partner_status->timestamp;

				/*
				 * A partner not yet reported as up is only
				 * declared up on the third timestamp change
				 * counted here (post-increment compared
				 * against 2).
				 * NOTE(review): changed_times is never reset
				 * in this function, so after the first time a
				 * returning partner is declared up on its
				 * first change -- confirm that is intended.
				 */
				if (prior_partner_state != NODE_UP &&
				                       changed_times++ >= 2) {
					partner_state = NODE_UP;
				} else {
					/*Look for config change notification*/
					configChangeCheck(
					      partner_status->configTimestamp);
				}
			}
		} else if ( (partner_status->state == NODE_DOWN) ||
			    (partner_status->state == NODE_UNINITIALIZED) ) {
			/*
			 * Partner cleanly shutdown.
			 */
			partner_state = NODE_DOWN;
			timestamp_unchanged = 0;
		}

		if (timestamp_unchanged > max_same_timestamp) {
			time_t suspect_incarnation_num;
			/*
			 * Hung node.  Issue I/O barrier.
			 * First grab the incarnation number of the failed
			 * node. Upon successful shoot remember this failing
			 * incarnation number in "evil_incarnation_num".  If
			 * we encounter any future disk status block updates
			 * with this incarnation number we know that the
			 * shootdown code failed and the node we consider
			 * dead must still be alive.  If this ever happens
			 * we remove ourself from the cluster.
			 */
			suspect_incarnation_num = 
				partner_status->incarnationNumber;
			ret = shoot_partner();
			if (ret == 0) { /* success */
				partner_state = NODE_DOWN;
				last_partner_timestamp = 0;
				evil_incarnation_num = 
					suspect_incarnation_num;
				timestamp_unchanged = 0;
			} else {
				/*
				 * Log that we will be unable to failover
				 * services.  timestamp_unchanged is left
				 * alone, so the shoot is attempted again on
				 * the next iteration if nothing changes.
				 */
				clulog(LOG_EMERG, "quorumd_body"
				       "[%d]: ERROR: Unable to "
				       "shootdown partner.\n", myNodenum);
				clulog(LOG_EMERG, "quorumd_body"
				       "[%d]: ERROR: Services may be "
				       "unavailable.\n", myNodenum);
			}
		}

		if (partner_state != prior_partner_state) {
			/*
			 * Don't try to notify SM if it hasn't yet notified
			 * us that it's alive.  This prevents the log file
			 * from getting cluttered up with the following
			 * error message.
			 */
			if (sm_daemon_pid == (pid_t)0) {
			 	ret = 0;
				clulog(LOG_DEBUG, "quorumd_body: skipping SM "
					"member state change notification "
					"until its up.");
			}
			else {
				ret = notify_sm(partnerNodenum, partner_state);
			}
			if (ret < 0) {
				/*
				 * NOTE(review): the message prints the
				 * on-disk partner_status->state, not the
				 * partner_state value that was being sent.
				 */
				clulog(LOG_ERR, "quorumd_body: Failed "
				       "to notify Service Manager of "
				       "node state change: node %d, "
				       "state %d\n.", partnerNodenum,
				       partner_status->state);
			}
			else {
				/*
				 * Defer setting prior partner state until
				 * SM has been notified.  This will force
				 * message resend of notification.
				 */
				prior_partner_state = partner_state;
			}
		}

		/*
		 * Now that we've taken care of our membership business,
		 * we can make sure the other cluster daemons are running.
		 * After that, we'll even check for phone messages.
		 */
		if (monitor_cluster_daemons() < 0) {
			/*
			 * If the other node is alive, we'll reboot.  
			 * Otherwise, we hold tight.  We don't want to 
			 * decrease availability.
			 */
#ifdef TIMFIX 
			/* XXX - took this out. Doesn't appear to be cleanly
			 * shutting down the node, consequently the partner
			 * initiates a shoot which got into an error case.
			 */
			consider_reboot();
#else /* TIMFIX */
			clulog(LOG_ERR, "quorumd_body: SM reboot not yet implemented.\n");
#endif /* TIMFIX */
		}

		quorumd_regcheck(start_time);

		/*
		 * Use whatever remains of this interval for background
		 * scanning and request processing.
		 */
		elapsed_time = time(NULL) - start_time;
		if (elapsed_time < quorumd_interval) {
			scanNextCategory(categoryScanDelay);
			/* A zero return is taken as a request to stop. */
			if (!(process_requests(quorumd_interval - elapsed_time))) { 
				proceed = 0;
			}
		} else {
			clulog(LOG_WARNING, "quorumd_body: Spent too much time"
			       " updating on-disk data.  Processing of pending"
			       " connections will have to wait.\n");
		}

		if (!proceed) {
			ret = stop_quorumd(0/*complain*/);
			if (ret != 0) {
				/* Stop refused; try again next iteration. */
				clulog(LOG_ERR, "quorumd_body: failing stop "
					"until SM exits first.\n");
				proceed = 0;
			}
			else {
			        if (listenFD >= 0) {
            				msg_close(listenFD);
					listenFD = -1;
        			}                    
				clulog(LOG_DEBUG, "quorumd_body: cleanly stopped.\n");
				exit(0);
			}
		}

	} /* while */
}

/*
 * Startup-time check of whether the partner node is a cluster member.
 * This duplicates some of the functionality that already exists in the
 * quorumd_body function.
 *
 * Our own status block is written and the partner's block is read.  If the
 * partner's on-disk state is NODE_UP, its timestamp is polled once per
 * quorumd_interval, for at most (max_same_timestamp_netup * 2) polls, to
 * see whether it is really being updated.
 *
 * Each poll compares the partner's timestamp against the value seen on the
 * previous poll, so changed_times counts distinct updates.  (Comparing
 * against the very first sample would count a single update over and over
 * and let a partner that wrote once and then hung pass as active.)
 *
 * Side effect: sets the global partner_state to NODE_UP when 1 is returned.
 *
 * RETURN VALUES
 *   1 - partner is a member
 *   0 - partner is not a member
 *  -1 - partner thinks he's a member, but we should really shoot him.
 *  -2 - Unrecoverable error.  Don't start cluster daemons.
 */
static int
is_partner_member(void)
{
	int   last_partner_timestamp = 0;
	int   timestamp_unchanged = 0;
	int   changed_times = 0;
	int   i = 0;

	if (update_ondisk_status() < 0) {
		/*
		 * This can only fail due to an IO error.
		 */
		clulog(LOG_CRIT,"is_partner_member: Unable to update our "
		       "on-disk status block.\n");
		return(-2);
	}

	if (get_partner_status() < 0) {
		/*
		 * This can only fail due to an IO error.
		 */
		clulog(LOG_CRIT,"is_partner_member: Unable to read partner's "
		       "on-disk status block.\n");
		return(-2);
	}

	if (partner_status->state != NODE_UP) {
		/*
		 * Partner's on-disk state is not NODE_UP: nothing to verify.
		 */
		clulog(LOG_INFO, "is_partner_active: Partner is cleanly "
		       "stopped.  Generate session id and start crankin'.\n");
		return(0);
	}

	/*
	 * The partner claims to be up.  Check that its timestamp is
	 * actually being updated.
	 */
	last_partner_timestamp = partner_status->timestamp;
	clulog(LOG_INFO, "Determining partner's status.  This "
	       "could take a while.\n");
	for (i = 0; i < max_same_timestamp_netup * 2; i++) {
		sleep(quorumd_interval);
		if (update_ondisk_status() < 0) {
			/*
			 * This can only fail due to an IO error.
			 */
			clulog(LOG_CRIT,"is_partner_member: Unable to "
			       "update our on-disk status block.\n");
			return(-2);
		}
		if (get_partner_status() < 0) {
			clulog(LOG_CRIT,"is_partner_member: Unable to"
			    " read partner's on-disk status block.\n");
			return(-2);
		}
		if (partner_status->state != NODE_UP) {
			/* Partner's on-disk state left NODE_UP while we watched. */
			return 0;
		}
		if (last_partner_timestamp == partner_status->timestamp) {
			timestamp_unchanged++;
		} else {
			changed_times++;
			/* Advance the baseline so the next poll detects a NEW update. */
			last_partner_timestamp = partner_status->timestamp;
		}

		if (changed_times > 2) {
			clulog(LOG_INFO, "is_partner_active: Partner "
			      "is actively updating his timestamp.\n");
			break;
		}
	}
	if (timestamp_unchanged > max_same_timestamp_netup) {
		clulog(LOG_ERR, "is_partner_member: Partner is "
		       "inactive.\n");
		return(-1);
	}
	clulog(LOG_INFO, "is_partner_member: Partner is up and running"
	       ".  Starting cluster...\n");
	partner_state = NODE_UP;
	return(1);
}



/*
 * Spawn Cluster Daemons
 *
 * Fork and exec the program at daemon_name (used as both the path and
 * argv[0]).
 *
 * Unless built with DEBUG, the parent waits for the forked child to
 * terminate; the spawned daemon is expected to disassociate itself from its
 * controlling terminal and fork into the background, so the wait only
 * covers the foreground stage.
 *
 * Returns: the child's exit status if it exited normally (non-DEBUG),
 *          0 immediately after a successful fork (DEBUG),
 *          -1 if fork() or waitpid() failed or the child did not exit
 *          normally.
 */
static int
spawn_daemon(const char *daemon_name)
{
	pid_t dmn_pid=-1;
#ifndef DEBUG
	pid_t waited;
	int   status = 0;
#endif

	clulog(LOG_INFO, "spawn_daemon: starting %s.\n", daemon_name);
	if ((dmn_pid = fork()) == 0) {
		/*
		 * NOTE(review): this assignment runs in the child, where
		 * dmn_pid is 0, so it never updates the parent's copy of
		 * sm_daemon_fork_pid.  Left as-is; confirm whether the
		 * parent was meant to record the pid instead.
		 */
		if (strstr(daemon_name, SVCMGR_DAEMON)) {
			sm_daemon_fork_pid = dmn_pid;
		}
		/*
		 * Child.  Now Exec.  The argument list must end with a null
		 * *pointer*; a bare 0 is an int in a variadic call.
		 */
		execl(daemon_name, daemon_name, (char *)NULL);
		/*
		 * Only reached if the exec failed.  Use _exit() so the child
		 * does not run atexit handlers or flush stdio buffers that
		 * belong to the parent.
		 */
		_exit(1);
	} else if (dmn_pid < 0) {
		/*
		 * What the fork?
		 */
		return -1;
	}
	/*
	 * The first thing this daemon should do is disassociate itself from
	 * its controlling terminal and fork into the background.
	 */
#ifndef DEBUG
	/* Retry if interrupted by a signal so status is never read unset. */
	do {
		waited = waitpid(dmn_pid, &status, 0);
	} while ((waited < 0) && (errno == EINTR));
	if (waited < 0)
		return -1;
	if (WIFEXITED(status))
		return (WEXITSTATUS(status));
	return -1;
#else
	return 0;
#endif
}


/*
 * Start the service manager, the power daemon and the heartbeat daemon,
 * in that order, from BINDIR.
 *
 * Returns: 0 if everything started,
 *          -2 if the service manager reported any non-zero status (the
 *             remaining daemons are then not started),
 *          otherwise the negative result of the last of powerd/heartbeat
 *          that failed to start.
 */
static int
spawn_all_daemons()
{
	int ret=0;
	int ourRet=0;
	char daemon_path[MAXPATHLEN];

	/*
	 * Each snprintf() below rebuilds the whole path and always
	 * NUL-terminates it, so no separate clearing or pre-filling of the
	 * buffer is needed.
	 */
	snprintf(daemon_path, sizeof(daemon_path), "%s/%s", BINDIR,
		 SVCMGR_DAEMON);
	ret = spawn_daemon(daemon_path);
	if (ret) {
		clulog(LOG_CRIT,"spawn_daemons: Failed to start service "
		       "manager.\n");
		clulog(LOG_CRIT, "spawn_daemons: Service Manager "
		       "exit code %d\n", ret);
		/*
		 * If we can't start the service manager, we may as well
		 * not start the cluster daemons.
		 */
		return -2;
	}

	snprintf(daemon_path, sizeof(daemon_path), "%s/%s", BINDIR,
		 POWER_DAEMON);
	ret = spawn_daemon(daemon_path);
	if (ret < 0) {
		clulog(LOG_CRIT, "Unable to start powerd.  This means we "
		       "cannot issue I/O barriers, and will not be able to "
		       "fail over services.\n");
		ourRet = ret;
	}

	snprintf(daemon_path, sizeof(daemon_path), "%s/%s", BINDIR,
		 HEARTBEAT_DAEMON);
	ret = spawn_daemon(daemon_path);
	if (ret < 0) {
		clulog(LOG_ERR, "spawn_daemon: Failed to start heartbeat.\n");
		clulog(LOG_ERR, "spawn_daemons: heartbeat exit status: %d\n",
		       ret);
		ourRet = ret;
	}

	return ourRet;
}


/*
 * Log the contents of quorum_stats at debug level, followed by the raw
 * I/O statistics.
 */
void
printQuorumStats(void)
{
	clulog(LOG_DEBUG, "Quorumd Statistics:\n");

	/* Write tallies, then read tallies. */
	clulog(LOG_DEBUG,
	       "Writes: Good=%ld, Errors=%ld\n",
	       quorum_stats.goodWrites,
	       quorum_stats.errorWrites);
	clulog(LOG_DEBUG,
	       "Reads : Good=%ld, Errors=%ld\n",
	       quorum_stats.goodReads,
	       quorum_stats.errorReads);

	/* Duration figures for the quorumd loop. */
	clulog(LOG_DEBUG,
	       "Quorumd Duration: Max=%ld, Overruns=%ld\n",
	       quorum_stats.maxQuorumdDuration,
	       quorum_stats.tooLongQuorumd);

	printRawIOStats();
}


/*
 * Load the tunable parameters.  The logging level is refreshed first, then
 * each of the disk-related settings below is overridden if (and only if)
 * the cluster configuration supplies a value for it; otherwise the current
 * value is left alone.
 *
 * Parameter: hup - set to nonzero when called out of the signal handler.
 *	      Intended to distinguish parameters which can be set only at
 *	      startup time from those which can be changed at runtime;
 *	      currently unused.
 * Returns: 0 - success (always).
 */
static int
establishConfigParams(int __attribute ((unused)) hup)
{
	char *value;

	/*
	 * Utility routine: reads the configured logging level and applies
	 * it through the logging library.
	 */
	getVerboseLevel();

	/* Interval used between quorumd passes. */
	if ((CFG_Get((char *) CFG_DISK_PING_INTERVAL, NULL, &value) == CFG_OK)
	    && value) {
		quorumd_interval = atoi(value);
		clulog(LOG_DEBUG, "Overriding quorumd interval "
		       "to %d.\n", quorumd_interval);
	}

	/* Same-timestamp limit, "netup" variant. */
	if ((CFG_Get((char *)CFG_DISK_SAMETIME_NETUP, NULL, &value) == CFG_OK)
	    && value) {
		max_same_timestamp_netup = atoi(value);
		clulog(LOG_DEBUG,"Overriding same time netup to %d.\n",
		       max_same_timestamp_netup);
	}

	/* Same-timestamp limit, "netdown" variant. */
	if ((CFG_Get((char *)CFG_DISK_SAMETIME_NETDOWN,NULL,&value) == CFG_OK)
	    && value) {
		max_same_timestamp_netdown = atoi(value);
		clulog(LOG_DEBUG, "Overriding same time netdown to "
		       "%d.\n", max_same_timestamp_netdown);
	}

	/* Delay handed to the category scan. */
	if ((CFG_Get((char *) CFG_DISK_SCAN_DELAY, NULL, &value) == CFG_OK)
	    && value) {
		categoryScanDelay = atoi(value);
		clulog(LOG_DEBUG, "Overriding category scan delay to "
		       "%d.\n", categoryScanDelay);
	}

	return(0);
}

/*
 * Bootstrap the quorum daemon and then run its main loop in the calling
 * process.
 *
 * This function intentionally inlines a bunch of functionality, mainly
 * b/c it makes the initial cluster startup much more readable, and easy
 * to follow.  Also, some of the functionality here we don't want to export.
 *
 * Order of operations (each step aborts with -1 on failure unless noted):
 *   1. validate the cluster configuration and the quorum partitions;
 *   2. determine our node number and derive the status block offsets;
 *   3. allocate the aligned status buffers and write NODE_DOWN to disk;
 *   4. determine the partner's membership, shooting it if it is hung;
 *   5. mark ourselves NODE_UP and, unless the partner is already up,
 *      write the session id (net block) generated earlier;
 *   6. read config params, open the message listener, spawn the other
 *      cluster daemons (spawn failures are logged but not fatal here);
 *   7. enter quorumd_body().
 *
 * Returns: -1 on any startup failure; 0 only if quorumd_body() returns.
 */
static int
start_quorumd()
{
	int   nodeNumber, ret = -1;
	/* NOTE(review): only ever assigned below, never read in this function. */
	int   last_partner_timestamp = 0;
	int   retval = -1;
	char  netblk[SPACE_NET_BLOCK_DATA];	/* session id data */
	CluCfg *cfg;

	/*
	 * Configuration check to verify that the network interfaces
	 * are mapped properly.  The parsed configuration is used purely
	 * as a validity check and released right away.
	 */
        cfg = get_clu_cfg(CLU_CONFIG_FILE);
	if (cfg == NULL) {
	    /* errno values other than these three fail silently. */
	    switch (errno) {
		case ENOMEM:
		    clulog(LOG_ERR, "start_quorumd: System out of memory.\n");
		    break;
		case ENOENT:
		    clulog(LOG_ERR, "start_quorumd: Failed parsing config file.\n");
		    break;
		case EFAULT:
		    clulog(LOG_ERR, "start_quorumd: Invalid network configuration.\n");
		    clulog(LOG_ERR, "start_quorumd: Unable to associate "
		       "heartbeat channels with configured network adapters.\n");
		    break;
	    }
	    return(-1);
	}
	else {
	    free(cfg);
	}

	/*
	 * Set logging level.  This will pertain just through the bootstrap
	 * sequence.  Later in the startup process the logging level will
	 * be set per the configuration settings.
	 */
        clu_set_loglevel(LOG_NOTICE);

	if (quorumd_registered() < 0) {
		return(-1);
	}

	/*
	 * Start out by doing some validation on the device special file
	 * in the configuration file describing the primary and backup
	 * quorum partition.  If we can't open them up then there's no
	 * use in proceeding any further.
	 */

	/*
	 * Cleared for the duration of the bootstrap; set to 1 just before
	 * quorumd_body() is entered (see the end of this function).
	 */
	clu_msg_based_shoot = 0;

	if (initSharedFD() != 0) {
		clulog(LOG_ERR, "start_quorumd: Unable to open specified "
		       "quorum partitions.\n");
		clulog(LOG_ERR, "start_quorumd: Please check your "
		       "configuration settings.\n");
		return(-1);
	}
	clulog(LOG_DEBUG, "start_quorumd: successfully validated quorum "
	       "partitions.\n");


	myStartupTimestamp = time(NULL); // Records cluster startup time.

	nodeNumber = cluGetLocalNodeId();
	/*
	 * NOTE(review): nodeNumber == MAX_NODES is accepted by this check;
	 * confirm whether the upper bound should be exclusive.
	 */
	if ((nodeNumber < 0) || (nodeNumber > MAX_NODES)) {
		clulog(LOG_ERR, "start_quorumd: Invalid node number %d.\n", 
		       nodeNumber);
		return -1;
	}

	/*
	 * Perform a sanity check on the partition offsets.
	 */
	if (offsetParanoiaCheck()) {
		clulog(LOG_ERR, "start_quorumd: partition offset inconsistency.\n");
		return -1;
	}

	/*
	 * Determine the type of power switch being used.
	 */
	power_switch_type = PWR_type();

	/*
	 * First setup global state variables and the in-memory version of the
	 * node's state, that later gets written out to disk in the quorumd.
	 */
	myNodenum = nodeNumber;
	offsetOfMyStatusBlock = (OFFSET_FIRST_STATUS_BLOCK + 
				 (nodeNumber * SPACE_PER_STATUS_BLOCK));
	/*
	 * Establish rawio bounce IO buffer support.
	 */
	if (initAlignedBufStuff() < 0) {
		clulog(LOG_ERR, "start_quorumd: unable to init rawio"
		       " support.\n");
		return -1;
	}
	/*
	 * Three aligned buffers: our status block, the partner's status
	 * block, and a separate buffer for reading our own block back.
	 * Allocation failure is signalled by MAP_FAILED.  Buffers already
	 * allocated are not released on the error returns below.
	 */
	myStatus = (NodeStatusBlock *)allocAlignedBuf();
	if (myStatus == MAP_FAILED) {
		clulog(LOG_ERR, "start_quorumd: unable to allocate aligned "
		       "status buffer.\n");
		return -1;
	}
	partner_status = (NodeStatusBlock *)allocAlignedBuf();
	if (partner_status == MAP_FAILED) {
		clulog(LOG_ERR, "start_quorumd: unable to allocate aligned "
		       "partner status buffer.\n");
		return -1;
	}
	myReadStatus = (NodeStatusBlock *)allocAlignedBuf();
	if (myReadStatus == MAP_FAILED) {
		clulog(LOG_ERR, "start_quorumd: unable to allocate aligned "
		       "read status buffer.\n");
		return -1;
	}

	/*
	 * Sets up our incarnation number, and initializes our state
	 * to NODE_DOWN.
	 */
	node_status_init_state(myStatus);
	node_status_init_state(myReadStatus);
	memset(&quorum_stats, 0, sizeof(QuorumStats));
	/*
	 * Kludge to figure out the partner's node number, falls
	 * apart for more than 2 nodes.
	 */
	partnerNodenum = myNodenum ^ 1;
	offsetOfPartnerStatusBlock = (OFFSET_FIRST_STATUS_BLOCK + 
				      (partnerNodenum * SPACE_PER_STATUS_BLOCK));

	/*
	 * Initialize the random session id, so we don't have to do this
	 * in a time-critical path later.
	 */
	memset(netblk, 0, SPACE_NET_BLOCK_DATA);
	gen_random(netblk, SPACE_NET_BLOCK_DATA);
	/*
	 * Now make sure we can access the shared SCSI disk.
	 */
	clu_lock();
	myStatus->state = NODE_DOWN;
	if (update_ondisk_status() != 0) {
		clu_unlock();
		return -1;
	}
	clu_unlock();

	/*
	 * Now the fun begins...
	 */
	ret = is_partner_member();
	switch (ret) {

	case -1:  /* Partner is hung */
		/*
		 * Shoot the partner... then continue on to case 0.
		 */
		clulog(LOG_ERR, "start_quorumd: Partner is hung.  Proceeding "
		       "with shoot-down.\n");
		/*
		 * Here we issue the low-level I/O calls ourselves.  We
		 * cannot rely on the power daemon, as it hasn't been started.
		 */
		if (clu_powerCyclePartner() < 0) {
			clulog(LOG_CRIT, "Unable to issue I/O barrier.  Not"
			       " starting cluster daemons.\n");
			return -1;
		}
	        /*
		 *  Now we mark our partner as down.
		 */
	        clulog(LOG_NOTICE, "start_quorumd: Successfully shot partner."
		       "  Setting partner's node state to down.\n");
		partner_status->state = NODE_DOWN;
		last_partner_timestamp = 0;
		evil_incarnation_num = partner_status->incarnationNumber;

		/* FALLTHROUGH (intentional: continue with the common path) */
	case 0:  /* Partner is not a member */
	case 1:  /* Partner is a member */
		/*
		 *  These two cases are handled the same way to avoid a race
		 *  condition in which the partner node comes up immediately
		 *  after we decide he is down, then proceeds to update the
		 *  session id on the shared storage.
		 *
		 *  The whole sequence below runs between clu_lock() and
		 *  clu_unlock(); every exit path releases the lock.
		 */
		clu_lock();
		myStatus->state = NODE_UP;
		if (update_ondisk_status() != 0) {
			clu_unlock();
			return -1;
		}
		/* Re-read the partner's block after announcing ourselves. */
		if (get_partner_status() < 0) {
			clu_unlock();
			return -1;
		}
		if (partner_status->state == NODE_UP) {
			/*
			 * Partner initialized the session id while we
			 * weren't looking.
			 */
			clu_unlock();
			break;
		}

		/* Partner is not up: we are the one to write the session id. */
		retval = setNetBlockData(netblk);
		if (retval < 0) {
			clu_unlock();
			return -1;
		}
		clu_unlock();
		break;

	case -2:  /* Unrecoverable error, exit */
	default:
		return -1;
	} /* switch(ret) */

	/*
	 * The configuration parameters are read in AFTER we establish ourself
	 * as a cluster member. Must be done after lock initialization.
	 */
	if (establishConfigParams(0) != 0) {
		clulog(LOG_ERR, "start_quorumd: unable to establish config "
		       "params.\n");
		return -1;
	}
        /*
         * Set the timestamp associated with the configuration database in our
         * status block.  The return value is not checked here.
         */
        notifyMemberConfigChange();

	listenFD = msg_listen(PROCID_QUORUMD);
	if (listenFD < 0) {
		clulog(LOG_CRIT, "start_quorumd: Unable to initialize "
		       "message services.  Not starting cluster daemons.\n");
		return -1;
	}


	/*
	 * Now spawn the cluster daemons.
	 *
	 * NOTE(review): when spawn_all_daemons() returns -2 the log says
	 * "Not starting the cluster software", yet execution still falls
	 * through to quorumd_body() below.  Confirm whether that is the
	 * intended behavior.
	 */
	ret = spawn_all_daemons();
	if (ret < 0) {
		clulog(LOG_ERR, "Error starting cluster daemons.\n");
		if (ret == -2) {
			clulog(LOG_CRIT, "Not starting the"
			       " cluster software.\n");
		} else {
			clulog(LOG_ERR, "Trying to continue.\n");
		}
	}

	/* Run quorumd in context of calling process. */

	clu_msg_based_shoot = 1;
	quorumd_body();

	return(0);
}


/*
 * Called to terminate the quorum daemon sub-process.
 *
 * If the service manager (SM) is still running, SM is told that this node
 * is going down and the stop is refused (return 1); SM is expected to stop
 * its services and then send quorumd a stop message of its own.
 * Otherwise: the listening descriptor is closed, our on-disk state is
 * marked NODE_DOWN, the aligned buffers are released, and the heartbeat
 * and power daemons are asked to terminate.
 *
 * Args: complain - documented as "print error messages if the daemon isn't
 *		    currently running"; not consulted by this implementation.
 * Returns: 0 on success.  It will return success even if there is no
 * 	    quorumd currently running.
 *	    1 - unable to safely exit because SM is running.
 *	    ECONNREFUSED - powerd was contacted but did not reply.
 */
static int
stop_quorumd(int complain)
{
	generic_msg_hdr  msghdr;
	PswitchStatusMsg pswitch_msg;
	msg_handle_t     con=-1, ret=-1;
	int              auth=0, hbmsg = 0;
	int		 retval;

	/*
	 * The cluster shutdown sequence must first stop svcmgr in order to
 	 * cause it to stop all services and thereby free up any associated
	 * resources.  Consequently it is not safe to stop the cluster if
	 * svcmgr hasn't been cleanly stopped first (which waits around for
	 * all of the services to cleanly stop).
	 */
	if ((sm_daemon_pid != 0) || (sm_daemon_fork_pid != 0)) {
		if (sm_daemon_pid == 0) {
			/*
			 * Covers a race condition where a cluster stop is
			 * initiated at a point where SM has started up, but
			 * not yet sent an DISK_SM_ALIVE message.
			 */
			sm_daemon_pid = sm_daemon_fork_pid;
		}
	    	retval = check_pid_valid(sm_daemon_pid,SM_COMMAND_NAME);
               	if (retval == 1) {
			/*
			 * The way we handle this race condition is to
			 * simply send a message to SM indicating that this
			 * node is going down.  In turn, SM will stop all
			 * services on this node and then when its done,
			 * send quorumd a stop message.  This necessitates
			 * having quorumd not exit the messaging loop for
			 * this particular stop message, but rather to wait
			 * around for the one from SM.
			 */
                        clulog(LOG_ERR, "stop_quorumd: SM is running.\n");
        		if (notify_sm(myNodenum, NODE_DOWN) != 0) {
                		clulog(LOG_ERR, "stop_quorumd: failed to notify"
                       		" SM we're down.\n");
        		}                                     
                        clulog(LOG_ERR, "stop_quorumd: sent node down to SM.\n");
			return(1);
                } 
	}
	clulog(LOG_DEBUG, "stopQuorumd: stopping quorum daemon.\n");
	pid_quorumd = -1;

	/*
	 * Stop listening for incoming connections.  We wanna go home.
	 * Invalidate the descriptor once closed so that nobody (including
	 * our caller) closes the same number a second time.
	 */
	clulog(LOG_DEBUG, "stop_quorumd: closing listendFD.\n");
	if (listenFD > 0) { /* for sanity sake */
		close(listenFD);
		listenFD = -1;
	}

	/*
	 * Mark the disk state to be down so that the partner node recognizes
	 * this as a clean shutdown (and subsequently, won't shoot us!).
	 * Do this after killing off the quorumd so that it doesn't change the
	 * state field to active behind our back.
	 */
	clulog(LOG_DEBUG, "stopQuorumd: Set my disk state to DOWN.\n"); 
	clu_lock();
	myStatus->state = NODE_DOWN;
	ret = update_ondisk_status();
	clu_unlock();
	if (ret != 0) {
		/*
		 * If we fail to mark our state as down, it should still 
		 * proceed to kill off the quorumd daemon.  In this scenario 
		 * we'll be initiating our own shutdown and/or getting shot.
		 */
		clulog(LOG_EMERG, "stopQuorumd: unable to mark disk "
		       "state to DOWN.\n");
	}

	/*
	 * Release the aligned status buffers.  From here on the status
	 * blocks must not be touched.
	 */
	freeAlignedBuf((char *)myStatus);
	freeAlignedBuf((char *)partner_status);
	freeAlignedBuf((char *)myReadStatus);
	myStatus = MAP_FAILED;
	deinitAlignedBufStuff();

	/*
	 * Tell heartbeat to terminate (best effort: no reply is awaited and
	 * a failed connection is silently ignored).
	 */
	memset(&hbmsg, 0, sizeof(hbmsg));
	con = msg_open(PROCID_HEARTBEAT, myNodenum);
	if (con > 0) {
		hbmsg = HB_TERM;
		(void)msg_send(con, &hbmsg, sizeof(hbmsg));
		msg_close(con);
	}

	/*
	 * Stop the power daemon.  Unlike heartbeat, a reply is awaited
	 * (10 is passed as the receive timeout).
	 */
	memset(&pswitch_msg, 0, sizeof(pswitch_msg));
	memset(&msghdr, 0, sizeof(msghdr));

	con = msg_open(PROCID_POWERD, myNodenum);
	if (con > 0) {
		fill_generic_hdr(&msghdr, PSWITCH_TERM, 0);
		(void)msg_send(con, &msghdr, sizeof(msghdr));
		ret = msg_receive_timeout(con, &pswitch_msg, sizeof(pswitch_msg)
						, &auth, 10);
		/* Close exactly once, whatever the outcome of the receive. */
		msg_close(con);
		if (ret <= 0) {
			/*
			 * NOTE(review): this returns non-zero after the
			 * status buffers above were already freed; a caller
			 * that keeps running after a failed stop would be
			 * using released buffers.  Confirm caller behavior.
			 */
			clulog(LOG_ERR, "stop_quorumd: can't get reply"
		       	" from powerd.\n");
			return(ECONNREFUSED);
		}
	}

	clulog(LOG_DEBUG, "stopQuorumd: completed stop.\n");
	clulog_close();

	/*
	 * Close all open connections.
	 */
	close_open_comms();
	return(0);
}


/*
 * Program entry point.  There are currently no command line options.
 * Returns 0 if start_quorumd() reports success; otherwise logs the failure
 * and exits with status 1.
 */
int
main(int __attribute__ ((unused)) argc, char **argv)
{
#ifndef DEBUG
	/*
	 * daemon_init will verify that there is only one instance of quorumd
	 * running; that the caller who initiated it is root; lock in memory
	 * and elevate priority.
	 */
	daemon_init(argv[0]);
#endif

	/*
	 * Start up the quorum daemon which will run forever in a while loop
	 * until it receives a stop message.
	 */
	if (start_quorumd() == 0) {
		return(0);
	}

	clulog(LOG_EMERG, "Unable to start quorum daemon.\n");
	exit(1);
}

/*
 * Tell the daemons on this node that the configuration has changed.
 * quorumd handles it by invoking its own SIGHUP handler directly; the
 * service manager, powerd and heartbeat each get a SIGHUP when a pid is
 * known for them.
 * (Considered implementing as a message send, but sending a signal is
 * asynchronous, lighter weight and not dependent on the daemon being
 * in a position where its waiting around for messages.)
 *
 * Returns: 0 always.
 */
static int
notifyLocalDaemons(void) 
{
    pid_t target;

    /* Ourselves first. */
    quorumd_sighup_handler(0);

    /* Service manager. */
    target = sm_daemon_pid;
    if (target > (pid_t)0) {
	clulog(LOG_DEBUG, "notifyLocalDaemons: notify SM at pid %d.\n", target);
	kill(target, SIGHUP);
    }

    /* Power daemon. */
    target = powerd_daemon_pid;
    if (target > (pid_t)0) {
	clulog(LOG_DEBUG, "notifyLocalDaemons: notify powerd at pid %d.\n", target);
	kill(target, SIGHUP);
    }

    /*
     * Heartbeat.  SM & powerd tell quorumd what their pid is, but hb's pid
     * has to be looked up here the first time it is needed; the result is
     * kept in hb_daemon_pid.
     */
    if (hb_daemon_pid <= 0) {
	hb_daemon_pid = get_daemons_pid(HEARTBEAT_DAEMON);
    }
    target = hb_daemon_pid;
    if (target > (pid_t)0) {
	clulog(LOG_DEBUG, "notifyLocalDaemons: notify hb at pid %d.\n", target);
	kill(target, SIGHUP);
    }

    return(0);
}
/*
 * Notify partner cluster member that a configuration change has occurred.
 * This is accomplished by updating the configuration timestamp in our
 * node status block and expecting the other cluster member to notice that
 * it has changed.
 *
 * Returns: 0 on success; -1 if the configuration database has no
 * timestamp entry or the timestamp fails verifyTimestampNewer().
 */
static int
notifyMemberConfigChange(void) 
{
    /*
     * Start from NULL: the result of CFG_Get() is not inspected, so the
     * pointer must have a defined value even if the lookup leaves it
     * untouched.
     */
    char *time_string = NULL;
    time_t newtime;

    // Extract configuration timestamp from "database"
    CFG_Get("cluster%timestamp", 0, &time_string);
    if (NULL == time_string) {
	clulog(LOG_DEBUG, "notifyMemberConfigChange: can't get timestamp of config db.\n");
	return(-1);
    }
    newtime = (time_t)atol(time_string);
    // Sanity check to verify that the timestamp is in fact newer
    if (verifyTimestampNewer(newtime) < 0) {
	return(-1);
    }
    // Adjust the configuration timestamp setting in status block
    updateConfigTimestamp(newtime);
     
    return(0); 
}
/*
 * Called in response to receipt of a message from the configuration
 * utilities indicating a change in the configuration stored in the
 * "configuration database" portion of the quorum partition.
 * In response to this, quorumd is responsible for:
 *	1) Notifying all the daemons on this local node
 *	2) Notifying quorumd on the other cluster member (which in turn
 *	   notifies the daemons on that node)
 * A failure of either step is logged; the second step is attempted
 * regardless of the outcome of the first.
 */
void
processConfigChange(void)
{
    /* Step 1: local daemons. */
    if (notifyLocalDaemons() < 0) {
	clulog(LOG_ERR, "processConfigChange: unable to notify daemons of config change.\n");
    }

    /* Step 2: partner member, via our status block. */
    if (notifyMemberConfigChange() < 0) {
	clulog(LOG_ERR, "processConfigChange: unable to notify partner of config change.\n");
    }
}
/*
 * Record tstamp as the configuration database timestamp in our own
 * in-memory status block (myStatus).
 */
static void
updateConfigTimestamp(time_t tstamp)
{
    myStatus->configTimestamp = tstamp;
}
/*
 * Check a candidate configuration timestamp against the one currently held
 * in our status block.
 *
 * Returns: 0 - if the new timestamp is not older than (i.e. is greater
 * than or equal to) the previous timestamp of the configuration database.
 * -1 if the "new" timestamp is older; both values are logged.
 */
static int
verifyTimestampNewer(time_t newtime)
{
    if (myStatus->configTimestamp > newtime) {
	/* %ld needs long arguments; cast explicitly. */
	clulog(LOG_ERR, "verifyTimestampNewer: new=%ld, prev=%ld.\n",
		(long)newtime, (long)myStatus->configTimestamp);
	return(-1);
    }
    return(0);
}
/*
 * Called when a partner node is determined to be up, its purpose is to check
 * whether the partner has updated the timestamp of the configuration
 * database as a means of notifying other cluster members
 * to re-read the configuration information.
 *
 * configDBTimestamp is compared with the last value remembered in
 * partnerConfigDBTimestamp:
 *   - zero, or equal to the remembered value: nothing happens;
 *   - older than the remembered value: a regression is logged;
 *   - newer, and a previous value was already remembered: the local
 *     daemons are notified.
 * Whenever the value differs, the remembered value is replaced.
 */
static void
configChangeCheck(time_t configDBTimestamp)
{
    if (!configDBTimestamp) {
	return;
    }
    if (partnerConfigDBTimestamp == configDBTimestamp) {
	return;
    }

    if (configDBTimestamp < partnerConfigDBTimestamp) {
	clulog(LOG_ERR, "configChangeCheck: config time regression!\n");
    }
    else if (partnerConfigDBTimestamp) {
	/* First sighting (remembered value still zero) is not a change. */
	clulog(LOG_DEBUG, "configChangeCheck: update config params.\n");
	notifyLocalDaemons();
    }

    partnerConfigDBTimestamp = configDBTimestamp;
}

/*
 * Look up a daemon's pid from its pid file, /var/run/<basename(prog)>.pid.
 *
 * prog is passed to basename(), which is permitted to modify its argument.
 *
 * Returns: the pid read from the file, or -1 if the file name does not
 * fit, the file is missing, empty or unreadable, or does not start with a
 * decimal number.
 */
static pid_t
get_daemons_pid(char *prog)
{
    FILE         *fp;
    char         filename[PATH_MAX];
    char         *cmd;
    int          pid_value = -1;
    int          len;
    int          nread;
    struct  stat st;

    /*
     *  Now see if there is a pidfile associated with this cmd in /var/run
     */
    cmd = basename(prog);
    len = snprintf(filename, sizeof(filename), "/var/run/%s.pid", cmd);
    if ((len < 0) || ((size_t)len >= sizeof(filename))) {
	/* Name would not fit in the buffer: no such pid file. */
	return (-1);
    }

    if ((stat(filename, &st) < 0) || (!st.st_size)) {
	return (-1);
    }

    /*
     * Read the pid from the file.  Parse into a plain int (what "%d"
     * expects) and close the stream on every path.
     */
    fp = fopen(filename, "r");
    if (fp == NULL) { 
	return(-1);
    }
    nread = fscanf(fp, "%d\n", &pid_value);
    fclose(fp);
    if (nread != 1) {
	return(-1);
    }
    return ((pid_t)pid_value);
}

/*
 * Local variables:
 *  c-basic-offset: 8
 *  c-indent-level: 8
 *  tab-width: 8
 * End:
 */
