/*
  Copyright Mission Critical Linux, 2000

  Kimberlite is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  Kimberlite is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Kimberlite; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
  MA 02139, USA.
 *
 * Author: Gregory P. Myrdal <Myrdal@MissionCriticalLinux.Com>
 *
 * svcmgr.c
 *
 *
 * This is the main module for the Service Manager daemon.  Service
 * Manager specific functions and functionality exist in this file.
 * Supporting files (svcmgr_engine.c, svcmgr_action.c) contain code
 * for the operation of the Service Manager.
 */

/*
 * Version string that is filled in by CVS (via the $Revision$ keyword).
 * Nothing else in this file references it, so it is marked "unused" to
 * keep the compiler from warning about it.
 */
static const char *version __attribute__ ((unused)) = "$Revision: 1.27 $";

/*
 * System includes
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/wait.h>
#include <sys/syslog.h>

/*
 * Cluster includes
 */
#include <clusterdefs.h>
#include <parseconf.h>
#include <clu_lock.h>
#include <msgsvc.h>
#include <clucfg.h>
#include <svcmgr.h>
#include <logger.h>
#include "svcmgr_proto.h"

/*
 * Global variables
 *
 * None of these are static, so they have external linkage.
 */
int myNodeID;				// my node ID; set in main() from the cluster configuration
char *myNodeName=NULL;			// my node name; main() stores a strdup()'d copy here
int myNodeState=NODE_UNINITIALIZED;	// my node state; main() sets NODE_UP on a local node-up message
msg_handle_t listen_fd=-1;		// listen file descriptor; -1 until main() calls msg_listen()
char nodeStates[MAX_NODES];		// svcmgr known node states, indexed by node ID

/*
 * Forward define of local functions
 *
 * All of these are defined further down in this file.
 */
static int closeFDs(void);
static int initServices(void);
static int initNodeStates(void);
static int initShellScriptLocking(void);
static int printNodeStatusAll(void);
static int printNodeStatus(int nodeID);
static int printServiceStatusAll(void);
static int printServiceStatus(int svcID);
static int setLogLevel(void);

/*
 * Externally defined functions
 *
 * These are used by this file but are not defined in it.
 */
extern void daemon_init(char *prog);
extern int check_process_running(char *prog, pid_t *pid);
extern int getServiceStatus(int svcID, ServiceBlock *svcStatus);
extern int initServiceSubsys(void);

/*
 * Externally defined variables
 */
extern int lockCnt;		// tested by the SIGSEGV/SIGBUS handlers before they call clu_un_lock()

/***************************************************************************
 ***************************************************************************
 *
 * Functions
 *
 ***************************************************************************
 ***************************************************************************/

/*
 * printUsage
 *
 * Write the one-line command synopsis to standard output, using
 * progName as the command name.
 */
static void
printUsage(char *progName)
{
	printf("%s [-h] [-d] [-s]\n", progName);
}

/***************************************************************************
 * Service Manager signal handlers
 ***************************************************************************/

/*
 * svcmgr_SIGTERM_handler
 *
 * Handler for a request to exit.  The request is treated like a
 * NODE_DOWN event for the local node and handed to nodeChange().  If
 * nodeChange() does not report SUCCESS an error is logged and the
 * handler simply returns.
 */
void
svcmgr_SIGTERM_handler(void)
{
	int rc;

	rc=nodeChange(myNodeID, NODE_DOWN);
	if (rc == SUCCESS)
	    return;

	clulog(LOG_ERR, 
"Cannot respond correctly to node state down, not exiting\n");
}

/*
 * svcmgr_SIGUSR1_handler
 *
 * Dump the Service Manager's view of the cluster: first the status of
 * every node, then the status of every service.  The return values of
 * the two print routines are not used.
 */
void
svcmgr_SIGUSR1_handler(void)
{
	(void) printNodeStatusAll();
	(void) printServiceStatusAll();
}

/*
 * svcmgr_SIGHUP_handler
 *
 * Reload the cluster configuration file (CLU_CONFIG_FILE) and then
 * re-apply the log level, since the reload may have changed it.
 */
void
svcmgr_SIGHUP_handler(void)
{
	(void) reReadDatabase(CLU_CONFIG_FILE);

	/* the configured log level may differ after the reload */
	(void) setLogLevel();
}

/*
 * svcmgr_SIGSEGV_handler
 *
 * Last-gasp handler for a segmentation fault.  If this process holds
 * the cluster lock (lockCnt > 0) the lock is released so that the rest
 * of the cluster is not left blocked by a process that is about to die.
 *
 * Returning normally from a handler for a real SIGSEGV is undefined and
 * in practice re-executes the faulting instruction forever.  Once the
 * lock is dropped, the default action is restored and the signal is
 * raised again so the process terminates as it would have without this
 * handler.
 *
 * NOTE(review): main() in this file does not register this handler;
 * it is presumably installed elsewhere.
 */
void
svcmgr_SIGSEGV_handler(void)
{
	if (lockCnt > 0)
	    clu_un_lock();

	(void) signal(SIGSEGV, SIG_DFL);
	(void) raise(SIGSEGV);
}

/*
 * svcmgr_SIGBUS_handler
 *
 * Last-gasp handler for a bus error.  If this process holds the
 * cluster lock (lockCnt > 0) the lock is released so that the rest of
 * the cluster is not left blocked by a process that is about to die.
 *
 * Returning normally from a handler for a real SIGBUS is undefined and
 * in practice re-executes the faulting instruction forever.  Once the
 * lock is dropped, the default action is restored and the signal is
 * raised again so the process terminates as it would have without this
 * handler.
 *
 * NOTE(review): main() in this file does not register this handler;
 * it is presumably installed elsewhere.
 */
void
svcmgr_SIGBUS_handler(void)
{
	if (lockCnt > 0)
	    clu_un_lock();

	(void) signal(SIGBUS, SIG_DFL);
	(void) raise(SIGBUS);
}

/*
 * svcmgr_SIGCHLD_handler
 *
 * Reap children that have exited so they do not linger in the defunct
 * state, logging the pid and raw wait status of each one.
 *
 * Several children exiting close together may produce only a single
 * SIGCHLD, so every child that is ready is collected.  WNOHANG keeps
 * the handler from blocking when the remaining children are still
 * running, or when the child was already collected by a wait() in the
 * main flow.  errno is saved and restored so the interrupted code does
 * not see a value set by waitpid().
 */
void
svcmgr_SIGCHLD_handler(void)
{
	int savedErrno=errno;
	pid_t pid;
	int status;

	while ((pid=waitpid(-1, &status, WNOHANG)) > 0)
	  {
	    clulog(LOG_DEBUG, "Child %d exited with status %d\n",
	           (int)pid, status);
	  }

	errno=savedErrno;
}

/***************************************************************************
 * Service Manager exit functions
 ***************************************************************************/

/*
 * svcmgrExit
 *
 * Tell the quorum daemon we are leaving and exit.  This function should only 
 * be called when all of the services we are serving have successfully stopped.
 */
void
svcmgrExit(int status)
{
	msg_handle_t send_fd=-1;
	static DiskMessageSt sendMsgBuf;
	ServiceBlock svcStatus;
	char *svcName;
	int svcID;
	int exitOK=YES;
	pid_t pid;
	int retVal;
	int loop=0;
	int weOwnServices=YES;
	int try=0;
	int max_try_count=10;

	/*
	 * If the Service Manager has children around, one of these
	 * might be a start service.  The Service Manager can not leave
	 * unless all services are cleanly stopped, thus, we must wait
	 * for all service stops and do not want any wait and starts
	 * lingering around.
	 */
	while (weOwnServices == YES)
	  {
	    /*
	     * Wait for all children to exit.
	     */
	    while (1)
	      {
	        if (loop == 0)
	          {
	            clulog(LOG_INFO,
"Service manager is checking for child processes before it will exit\n");
	          }
	        else
	          {
		    loop=1;
	            clulog(LOG_INFO,
"Service manager is waiting on a child before it will exit\n");
	          }

	          pid=wait(&retVal);

	          if (pid == -1)
	              break;		// no more processes to wait on

	          clulog(LOG_DEBUG, "Child %d returned with status %d\n", 
	                 pid, retVal);

	          /*
	           * If a child returned an error we do not know what state
	           * the services are in.  Mark that one returned error so
	           * we can check service states when all children have checked
	           * in.
	           */
	          if (WIFEXITED(retVal) == 0)	// child had an error
	            {
	              clulog(LOG_ERR,
"Cannot exit, child process %d returned error %d", pid, retVal);
	              exitOK=NO;
	            }
	        }

	      /*
	       * When all children have exited, check the state of all services.
	       * If there are no services owned by us we can exit.  If we own
	       * any service we can not exit. Stop the service(s) before 
	       * exiting.
	       */
	      weOwnServices = NO;

	      for (svcID=MIN_SERVICE; svcID < MAX_SERVICES; svcID++)
	        {
	          if ((retVal=serviceExists(svcID)) != YES)
	            {
	              continue;		// not in database, go to next service
	            }

	          getSvcName(svcID, &svcName);

	          if (lockAndReqServiceStatus(svcID, &svcStatus) != SUCCESS)
	            {
	              clulog(LOG_ERR, "Cannot get service status for %s\n", 
	                     svcName);
	              exitOK = NO;
	            }   

	          if (svcStatus.owner == myNodeID)
	            {
	              if (svcStatus.state == SVC_ERROR)
	                {
	                   clulog(LOG_EMERG,
"Cannot stop cluster software: service %s is in %s state owned by %s.\n",
	                          svcName, 
	                          serviceStateStrings[svcStatus.state],
	                          myNodeName);
	                   clulog(LOG_EMERG,
"The cluster software can not stop because it might have resources still
configured on this system (i.e. filesystem mounts).  A halt of this member
is suggested to free any resources it might own.\n");
	                   return;
	                }

	              clulog(LOG_WARNING,
"Cannot exit Service Manager yet, service %s is owned by %s in %s state\n",
	                     svcName, myNodeName, 
	                     serviceStateStrings[svcStatus.state]);
	              if (reqStopService(svcStatus.id, YES) != SUCCESS)
	                  exitOK = NO;
	              else
	                  weOwnServices= YES;
	            }
	        }
	    }

	if (exitOK != YES)
	  {
	    clulog(LOG_EMERG, 
"Service Manager can not exit.  Service resources may still be configured on
this cluster member.  A reboot of this cluster member is suggested.\n");
	    return;
	  }

	/*
	 * Tell the quorum daemon that we are leaving
	 */
	try = 0;
	exitOK = NO;
	while (exitOK == NO)
	  {
	    if (try >= max_try_count)
	      {
	        clulog(LOG_ERR, 
	               "Max retries reached, cannot contact quorum daemon.\n");
	        if (myNodeName != NULL)
	            free(myNodeName);
	        closeFDs();		// close any other open file descriptors
	        clulog(LOG_NOTICE, "Service Manager exiting\n");
	        exit(1);
	      }

	    try=try++;
	    if ((send_fd=msg_open(PROCID_QUORUMD, myNodeID)) < 0)
	      {
	        clulog(LOG_WARNING, 
	               "msg_open failed to quorum daemon, retrying...\n");
	        sleep(1);
	        continue;
	      }
	      exitOK = YES;
	  }

	sendMsgBuf.hdr.magic = GENERIC_HDR_MAGIC;
	sendMsgBuf.hdr.command = DISK_SM_EXITING;
	sendMsgBuf.hdr.length = DISK_MESSAGE_SIZE;

	exitOK = NO;
	try = 0;
	while (exitOK == NO)
	  {
	    if (try >= max_try_count)
	      {
	        clulog(LOG_ERR, 
	               "Max retries reached, cannot send to quorum daemon.\n");

	        if (send_fd >= 0)
	            msg_close(send_fd);

	        if (myNodeName != NULL)
	            free(myNodeName);
	        closeFDs();		// close any other open file descriptors
	        clulog(LOG_NOTICE, "Service Manager exiting\n");
	        exit(1);
	      }

	    clulog(LOG_DEBUG, 
	           "Telling the quorum daemon that we are leaving\n");

	    try=try++;
	    if ((retVal=msg_send(send_fd, &sendMsgBuf, DISK_MESSAGE_SIZE))
	                 != DISK_MESSAGE_SIZE)
	      {
	        clulog(LOG_WARNING, 
"Cannot send Service Manager exiting message, err=%d, retrying ...\n", retVal);
	        sleep(1);
	        continue;
	      }                                        
	      exitOK = YES;
	  }                                        

	if (send_fd >= 0)
	    msg_close(send_fd);
	if (myNodeName != NULL)
	    free(myNodeName);
	closeFDs();		// close any other open file descriptors
	clulog(LOG_NOTICE, "Service Manager exiting\n");
	exit(status);
}

/***************************************************************************
 * Service Manager communication functions
 ***************************************************************************/

/*
 * sendImAliveMsg
 *
 * Open a connection to the quorum daemon on the local node, send it a
 * DISK_SM_ALIVE message carrying our process ID, and close the
 * connection again.  This is done at initialization to tell the quorumd
 * that we are ready to accept node state changes, and periodically
 * afterwards as a sign of life.
 *
 * Returns SUCCESS when the whole message was sent, FAIL when the
 * connection could not be opened or the send came up short.
 */
int
sendImAliveMsg(void)
{
	static DiskMessageSt aliveMsg;
	msg_handle_t quorum_fd;
	int sent;
	int result=SUCCESS;

	clulog(LOG_DEBUG,
"Sending Service Manager alive message to quorum daemon\n");

	quorum_fd=msg_open(PROCID_QUORUMD, myNodeID);
	if (quorum_fd < 0)
	  {
	    clulog(LOG_WARNING, "Cannot open connection to quorum daemon\n");
	    return(FAIL);	// not fatal, the caller is expected to retry
	  }

	aliveMsg.hdr.magic = GENERIC_HDR_MAGIC;
	aliveMsg.hdr.command = DISK_SM_ALIVE;
	aliveMsg.hdr.length = DISK_MESSAGE_SIZE;
	aliveMsg.data.daemonPid = getpid();

	sent=msg_send(quorum_fd, &aliveMsg, DISK_MESSAGE_SIZE);
	if (sent != DISK_MESSAGE_SIZE)
	  {
	    clulog(LOG_ERR, 
"Cannot send Service Manager alive message to quorumd; err=%d\n", sent);
	    result=FAIL;
	  }

	/* the handle is known to be valid here, whatever the send did */
	msg_close(quorum_fd);
	return(result);
}

/*
 * closeFDs
 *
 * Shut down the Service Manager's long-lived connections: the one to
 * the logger and, if it was ever opened, the listen handle.  Always
 * returns SUCCESS.
 */
static int
closeFDs(void)
{
	clulog_close();			// connection to the loggerd

	if (listen_fd < 0)
	    return(SUCCESS);		// never opened, nothing more to do

	msg_close(listen_fd);		// our listen file descriptor
	return(SUCCESS);
}

/***************************************************************************
 * Node State functions
 ***************************************************************************/

/*
 * initNodeStates
 *
 * Mark every slot of the global node state array as
 * NODE_UNINITIALIZED.  The array is filled in later as node state
 * change events arrive.  Always returns SUCCESS.
 */
static int
initNodeStates(void)
{
	/* nodeStates is an array of char, so a byte fill covers it exactly */
	memset(nodeStates, NODE_UNINITIALIZED, sizeof(nodeStates));

	return(SUCCESS);
}

/*
 * svcmgrGetNodeState
 *
 * Given a node ID, return the current that is in our global node
 * state array.
 */
int
svcmgrGetNodeState(int nodeID)
{
	if (nodeID == NODE_ID_NONE)
            return(NODE_UNINITIALIZED);  

	return(nodeStates[nodeID]);
}

/*
 * printNodeStatusAll
 *
 * Emit a two-column table (node name, status) to both the cluster log
 * and standard output, with one row for every node ID from MIN_NODE up
 * to, but not including, MAX_NODES whose name can be looked up.
 * Always returns SUCCESS.
 */
static int
printNodeStatusAll(void)
{
	char *name=(char *)NULL;
	int id;

	/* table heading, cluster log first */
	clulog(LOG_NOTICE, "%-20s %-10s\n", "Node", "Status");
	clulog(LOG_NOTICE, "%-20s %-10s\n", 
	       "--------------------", "----------");

	/* same heading on stdout */
	printf("\t%-20s %-10s\n", "Node", "Status");
	printf("\t%-20s %-10s\n", 
	       "--------------------", "----------");

	for (id=MIN_NODE; id < MAX_NODES; id++)
	  {
	    /* IDs without a name are not configured, leave them out */
	    if (getNodeName(id, &name) == SUCCESS)
	        printNodeStatus(id);
	  }

	return(SUCCESS);
}

/*
 * printNodeStatus
 *
 * Emit one table row (node name, status) for nodeID to both the
 * cluster log and standard output, using the state the Service Manager
 * currently has on record for that node.
 *
 * Returns SUCCESS, or FAIL if the state lookup reports FAIL.
 */
static int
printNodeStatus(int nodeID)
{
	char *name;
	int state;

	getNodeName(nodeID, &name);

	state=svcmgrGetNodeState(nodeID);
	if (state == FAIL)
	  {
	    clulog_and_print(LOG_NOTICE, "Cannot get node state for node %s\n", 
	                     name);
	    return(FAIL);
	  }

	/* same row twice: once for the log, once for stdout */
	clulog(LOG_NOTICE, "%-20s %-10s\n", name, nodeStateStrings[state]);
	printf("\t%-20s %-10s\n", name, nodeStateStrings[state]);

	return(SUCCESS);
}

/***************************************************************************
 * Service state functions
 ***************************************************************************/

/*
 * initServices
 *
 * Initialize all services that we own to a known state.  This is 
 * called when we first boot to clean up any service state that
 * may have been left behind when we went down (crashed, panic'd).
 *
 * Pass 1 walks every configured service with the cluster lock held:
 *   - a service with no status block on the shared disk gets one;
 *   - a service with no owner that is in a state other than
 *     SVC_STOPPED, SVC_DISABLED or SVC_ERROR is reset to SVC_STOPPED.
 * Pass 2 stops all services.
 *
 * Returns SUCCESS, or FAIL if stopAllServices() fails.
 */
static int
initServices(void)
{
	int svcID;
	ServiceBlock svcStatus;
	char *svcName;
	char errMsg[MAX_ERROR_BUFLEN];

	clulog(LOG_INFO, "Initializing services\n");

	for (svcID=MIN_SERVICE; svcID < MAX_SERVICES; svcID++)
	  {
	    if (serviceExists(svcID) != YES)
	        continue;		// not in database, go to next service

	    getSvcName(svcID, &svcName);

	    clu_lock();

	    /*
	     * If service is not on the shared service information disk, add
	     * it.  We call getServiceStatus() here because an error during
	     * init is ok and we do not want to alarm anyone.
	     */
	    if (getServiceStatus(svcID, &svcStatus) != 0)
	      {
	        clulog(LOG_DEBUG, 
"Cannot get service status for %s (this is OK)\n", svcName);
	        if (initServiceStatus(svcID, errMsg) != SUCCESS)
	          {
	            clulog(LOG_ERR, 
"Cannot add service status information to shared disk for service %s", svcName);
	          }
	      }
	    /*
	     * If the owner of the service is not defined we do not have
	     * to initialize it (we should start it however, but that is
	     * not done here).  Note: if we ever hit this it may be an
	     * error as a service in any state except SVC_STOPPED and
	     * SVC_DISABLED should have an owner.  A service in SVC_ERROR
	     * state is left alone.
	     */
	    else if ((svcStatus.owner == NODE_ID_NONE) &&
	             (svcStatus.state != SVC_STOPPED) &&
	             (svcStatus.state != SVC_DISABLED) &&
	             (svcStatus.state != SVC_ERROR))
	      {
	        /*
	         * Log first: the message reports the state the service
	         * was found in, which is gone once we overwrite it.
	         */
	        clulog(LOG_WARNING, 
"Service %s is not owned by anyone but is set to %s, setting it to stopped\n",
	               svcName, serviceStateStrings[svcStatus.state]);
	        svcStatus.state = SVC_STOPPED;
	        reqServiceStatusChange(&svcStatus);
	      }

	    clu_un_lock();
	  }

	/*
	 * We stop all services to clean up any state in the case
	 * that this system came down without gracefully stopping services.
	 *
	 * If we own the service we should not be running it because
	 * we are just starting up.  Stop the service.  This can happen
	 * if the local node ungracefully comes down and no other node
	 * picks up the service.  This can also happen if someone kills
	 * the svcmgr and restarts it.  It would be better for the svcmgr
	 * restart case for us to know that the service is running and
	 * just let it continue to run, but we do not know if it is really
	 * running or if this is old state information.
	 */
	if (stopAllServices() != SUCCESS)
	  {
	    clulog(LOG_WARNING, "Cannot stop services during initialization\n");
	    return(FAIL);
	  }

	clulog(LOG_INFO, "All service stop(s) have finished\n");

	return(SUCCESS);
}

/*
 * initShellScriptLocking
 *
 * Initialize any locks known to the shell scripts before starting any
 * services.
 */
static int
initShellScriptLocking(void)
{
	int pid;
	int status;

	switch (pid=fork())
	  {
	  case -1:			// error
	    sprintf(errbuf, "fork failed: %s", sys_errlist[errno]);
	    clulog_and_print(LOG_ERR, "%s", errbuf);
	    return(FAIL);
	    break;

	  case 0:			// child
	    break;

	  default:			// parent
	      wait(&status);
	      return(status);
	    break;
	  }
	
	closeChildFDs();		// close any file descriptors

	clulog_and_print(LOG_INFO, "Initializing shell script locking");

	clulog_and_print(LOG_DEBUG, "Exec'ing script '%s'\n", 
	                 SVC_SHELL_LOCK_INIT);

	if (execv(SVC_SHELL_LOCK_INIT, (char **)NULL) != 0) 
	  {
	    sprintf(errbuf, "Cannot exec script '%s': %s\n", 
	            SVC_SHELL_LOCK_INIT, sys_errlist[errno]);
	    clulog_and_print(LOG_ERR, "%s", errbuf);
	    return(FAIL);
	  }

	return(SUCCESS);		// should never get here
}

/*
 * printServiceStatusAll
 *
 * Emit a three-column table (service, status, owner) to both the
 * cluster log and standard output, with one row for every configured
 * service ID from MIN_SERVICE up to, but not including, MAX_SERVICES.
 * This is the same range the other service loops in this file walk.
 * Always returns SUCCESS.
 */
static int
printServiceStatusAll(void)
{
	int svcID;

	clulog(LOG_NOTICE, " \n");
	clulog(LOG_NOTICE, "%-20s %-10s %-20s\n", 
	        "Service", "Status", "Owner");
	clulog(LOG_NOTICE, "%-20s %-10s %-20s\n", 
	       "--------------------", "----------", "----------------------");

	printf("\t%-20s %-10s %-20s\n", 
	        "Service", "Status", "Owner");
	printf("\t%-20s %-10s %-20s\n", 
	       "--------------------", "----------", "----------------------");

	for (svcID=MIN_SERVICE; svcID < MAX_SERVICES; svcID++)
	  {
	    /* IDs that are not in the database get no row */
	    if (serviceExists(svcID) == YES)
	        printServiceStatus(svcID);
	  }

	return(SUCCESS);
}

/*
 * printServiceStatus
 *
 * Emit one table row (service name, state, owner name) for svcID to
 * both the cluster log and standard output.
 *
 * The owner column reads "None" when the service has no owner and
 * "Unknown" when the owner's name cannot be looked up, so a NULL
 * pointer is never handed to a %s conversion.
 *
 * Returns SUCCESS, or FAIL if the service status cannot be read.
 */
static int
printServiceStatus(int svcID)
{
	ServiceBlock svcStatus;
	char *svcName;
	char *svcOwnerName=(char *)NULL;

	getSvcName(svcID, &svcName);

	if (lockAndReqServiceStatus(svcID, &svcStatus) != SUCCESS)
	  {
	    return(FAIL);
	  }

	if (svcStatus.owner == NODE_ID_NONE)
	  {
	    svcOwnerName="None";
	  }
	else if ((getNodeName(svcStatus.owner, &svcOwnerName) != SUCCESS) ||
	         (svcOwnerName == NULL))
	  {
	    svcOwnerName="Unknown";	// owner ID has no name we can find
	  }

	clulog(LOG_NOTICE, "%-20s %-10s %-20s\n", svcName,
	       serviceStateStrings[svcStatus.state],
	       svcOwnerName);

	printf("\t%-20s %-10s %-20s\n", svcName,
	       serviceStateStrings[svcStatus.state],
	       svcOwnerName);

	return(SUCCESS);
}

/***************************************************************************
 * Service Manager system functions
 ***************************************************************************/

/*
 * rebootSystem
 *
 * Reboot the system by replacing this process with
 * "SHUTDOWN_CMD -r now <reason>".  Filesystem buffers are flushed
 * first.
 *
 * reason: text passed to the shutdown command as its last argument.
 *         A NULL reason simply ends the argument list one entry early.
 *
 * On success this function does not return.  If the exec fails the
 * failure is logged and the function returns to its caller.
 */
void
rebootSystem(char *reason)
{
	clulog(LOG_EMERG, "Service Manager rebooting system\n");
	sync(); sync(); sync();

	/*
	 * The terminating null of a variable argument list has to be a
	 * pointer, hence the cast.
	 */
	execl(SHUTDOWN_CMD, SHUTDOWN_CMD, "-r", "now", reason, (char *)NULL);

	/* execl() only comes back on failure */
	clulog(LOG_EMERG, "Cannot exec %s: %s\n", SHUTDOWN_CMD,
	       strerror(errno));
}

/*
 * systemShuttingDown
 *
 * Report whether the system is shutting down.  No detection is done
 * here at present: the answer is unconditionally YES, and NO is never
 * returned.
 */
int
systemShuttingDown(void)
{
	return YES;
}

/***************************************************************************
 * Service Manager configuration functions
 ***************************************************************************/

/*
 * setLogLevel
 *
 * Bring the logger's level in line with the Service Manager log level
 * held in the cluster configuration database.
 *
 * Both the configured level and the logger's current level are looked
 * up; a failure of either lookup is logged.  The level is changed only
 * when both lookups worked and the two values differ.  Always returns
 * SUCCESS, even when a lookup or the change itself fails.
 */
static int
setLogLevel(void)
{
	int wantedLevel;
	int currentLevel;

	/*
	 * What the configuration asks for
	 */
	if (getSvcMgrLogLevel(&wantedLevel) == FAIL)
	  {
	    clulog(LOG_ERR, 
"Cannot get log level for Service Manager from configuration database\n");
	    wantedLevel=FAIL;
	  }

	/*
	 * What the logger is using right now
	 */
	currentLevel=clu_get_loglevel();
	if (currentLevel == -1)
	  {
	    clulog(LOG_ERR, "Cannot get current log level from the logger\n");
	    currentLevel=FAIL;
	  }

	/*
	 * Nothing to do if either value is missing or nothing changed
	 */
	if ((currentLevel == FAIL) ||
	    (wantedLevel == FAIL) ||
	    (currentLevel == wantedLevel))
	    return(SUCCESS);

	clulog(LOG_NOTICE, 
"Setting Service Manager logging level to %d\n", wantedLevel);

	if (clu_set_loglevel(wantedLevel) == -1)
	  {
	    clulog(LOG_ERR, 
"Cannot set Service Manager logging level to %d\n", wantedLevel);
	  }

	return(SUCCESS);
}

/***************************************************************************
 ***************************************************************************
 *
 * Main
 *
 ***************************************************************************
 ***************************************************************************/
int
main(int argc, char **argv)
{
	CluCfg *cfg;
	static DiskMessageSt rcvMsgBuf;
	int retVal;
	int auth=0;
	char buf[128];			// buffer for perror output
	sigset_t set;
	msg_handle_t accept_fd=-1;
	int rcv_timeout;		// time out for msg_receive
	int accept_timeout;		// time out for msg_accept
	int become_daemon=1;		// default to running as a daemon
	char opt;
	uid_t uid;
	pid_t pid;

	uid=getuid();
	if (uid)
	  {
	    fprintf(stdout, "%s should only be run as user root\n",
	            argv[0]);
	    exit(1);
	  }

	/*
	 * Check to see if we are running from the command line
	 * with an argument passed to us.  If this is the case
	 * we do not want to become a daemon.  We will perform
	 * the request and exit.
	 */
	while ((opt = getopt(argc, argv, "l:dsh")) != -1)
	  {
	    switch (opt)
	      {
	      /*
	       * A request has been made to stop all services and
	       * the svcmgr.
	       */
              case 's':			// stop services
	        if ((check_process_running(argv[0], &pid)) &&
	            (pid > 0))
	          {
	            /*
	             * If there is a svcmgr already running
	             * send it a SIGTERM signal to stop all
	             * services and exit.
	             */
	            if (kill(pid, SIGTERM) != 0)
	              {
	                printf("Cannot send SIGTERM to svcmgr process: %s\n",
	                        strerror(errno));
	                exit(1);
	              }
	          }
	        else
	          {
	            clulog_and_print(LOG_ERR, 
"Cannot stop Service Manager as it is not running\n");
	            exit(1);
	          }
	        exit(0);

              case 'd':			// do not run as a daemon
	        become_daemon=0;
	        break;

              case 'h':			// command line help
                printUsage(argv[0]);
	        exit(0);

              default:			// unknown option
                printUsage(argv[0]);     
	        exit(0);
	      }
	  }

	if (become_daemon)
	    daemon_init(argv[0]);		// become a daemon

	setLogLevel();

	clulog(LOG_INFO, "Service Manager starting\n");

	/*
	 * daemon_init() blocks most signals, so we need to add the
	 * ones the Service Manager is interested in.
	 */
	sigemptyset(&set);
	sigaddset(&set, SIGTERM);
	sigaddset(&set, SIGHUP);
	sigaddset(&set, SIGUSR1);
	sigaddset(&set, SIGCHLD);
	sigprocmask(SIG_UNBLOCK, &set, NULL);

	/*
	 * Set up a signal handler exit on receipt of SIGTERM.
	 */
	(void) signal(SIGTERM, (void (*)(int))svcmgr_SIGTERM_handler);

	/*
	 * Set up a signal handler to re-read the config file when SIGHUP
	 * is received.
	 */
	(void) signal(SIGHUP, (void (*)(int))svcmgr_SIGHUP_handler);

	/*
	 * Register a signal handler to print out the Service Manager
	 * state when a SIGUSR1 is sent to it.
	 */
	(void) signal(SIGUSR1, (void (*)(int))svcmgr_SIGUSR1_handler);

	/*
	 * Register a signal handler so our children do not get hung in 
	 * defunct state when they exit.
	 */
	(void) signal(SIGCHLD,  (void (*)(int))svcmgr_SIGCHLD_handler);

	/*
	 * Get cluster information about what node I am
	 */
	cfg = get_clu_cfg(CLU_CONFIG_FILE);
	if (cfg == NULL) {
	  sprintf(buf, "error in get_clu_cfg: %s\n", strerror(errno));
	  clulog(LOG_ERR, buf);
	  svcmgrExit(1);
	}          

	myNodeID = cfg->lid;
	free(cfg);
	cfg = NULL;
	/*
	 * getNodeName send us a pointer to a static chunk of memory.
	 * Since this is a global we do not another call to this function
	 * to overwrite it, create our own copy.
	 */
	getNodeName(myNodeID, &myNodeName);
	myNodeName=strdup(myNodeName);

	/*
	 * Initialize our global array of cluster node states
	 */
	initNodeStates();

	/*
	 * Initialize the shell script locking subsystem
	 */
	if ((retVal=initShellScriptLocking()) != SUCCESS)
	  {
	    clulog(LOG_ERR, 
	      "Cannot initialize shell script locking subsystem; error= %d\n",
	      retVal);
	    svcmgrExit(1);
	  }

	/*
	 * Set up the message service
	 */
	if ((listen_fd = msg_listen(PROCID_SVCMGR)) < 0)
	  {
	    clulog(LOG_ERR, "Cannot set up listen file descriptor: %s\n",
	               strerror(errno));
	    svcmgrExit(1);
	  }

	accept_timeout=MSG_TIMEOUT_INIT;
	rcv_timeout=MSG_TIMEOUT_NORMAL;

	while (1)
	  {
	    accept_fd = msg_accept_timeout(listen_fd, accept_timeout);

	    if (accept_fd == 0)		// Timeout
	      {
	        /*
	         * Tell quorumd that we are alive.  After the first alive 
	         * message the quorumd will respond with the node states.
	         */
	        if (sendImAliveMsg() == SUCCESS)
	            accept_timeout=MSG_TIMEOUT_NORMAL;

	        continue;
	      }

	    if (accept_fd < 0)		// Error
	      {
	        clulog(LOG_ERR, "msg_accept() returned error: %s\n",
	               strerror(errno));
	        continue;
	      }

	    /*
	     * If we got here someone has established a connection with
	     * us and there is data to read.
	     */
	    retVal = msg_receive_timeout(accept_fd, &rcvMsgBuf, 
	             DISK_MESSAGE_SIZE, &auth, rcv_timeout);

	    if (accept_fd >=0)
	        msg_close(accept_fd);

	    if (retVal == 0) 		// Timeout
	      {
	         clulog(LOG_WARNING, "Timeout receiving message: %s\n",
	                strerror(errno));
	         continue;
	      }

	    if (retVal < 0) 		// Error
	      {
	        clulog(LOG_ERR, 
"Received error %d from msg_receive_timeout()\n", retVal);
	        continue;
	      }

	    if (retVal > 0) 		// Read retVal bytes
	      {
		if ((rcvMsgBuf.hdr.magic != GENERIC_HDR_MAGIC) ||
		    (retVal != DISK_MESSAGE_SIZE))
	          {
			clulog(LOG_ERR, "bad magic # or incomplete read.\n");
		  }
		else
	          {
		  switch (rcvMsgBuf.hdr.command)
	            {
		      case DISK_NODE_STATECHANGE:
                       if (!auth)
                         {
                           clulog(LOG_CRIT,"auth=%d\n",auth);
                           clulog(LOG_CRIT,
"Received DISK_NODE_STATECHANGE request from unauthorized connection.\n");
                           break;
                         }
			clulog(LOG_DEBUG, 
"Received DISK_NODE_STATECHANGE(%d, %d) request\n",
	                            rcvMsgBuf.data.statusMsg.nodeNumber,
	                            rcvMsgBuf.data.statusMsg.nodeStatus);

	                if ((rcvMsgBuf.data.statusMsg.nodeNumber == myNodeID)
	                  &&(rcvMsgBuf.data.statusMsg.nodeStatus == NODE_UP))
	                  {
	                    clulog(LOG_DEBUG, 
"Received a local HOST_UP, we can now initialize services\n");
	                    myNodeState=NODE_UP;

	                    /*
	                     * Initialize all local services to SVC_STOPPED.
	                     *
	                     * We do this here because if we went down with
	                     * services owned by us we need to reset their
	                     * state in case we are the first cluster node to
	                     * boot.  We can not do this earlier because the
	                     * lock structure needs to initialized by the
	                     * quorum daemon.
	                     */
	                    if (initServices() != SUCCESS)
	                      {
	                        clulog(LOG_ERR, "Cannot initialize services\n");
				svcmgrExit(1);
	                      }
	                  }

	                /*
	                 * Until we get a message from the quorum daemon that
	                 * we are up, ignore all state information from other
	                 * nodes.  If we get a local NODE_DOWN, then we need
	                 * to service that.
	                 */
	                if ((rcvMsgBuf.data.statusMsg.nodeNumber != myNodeID) &&
	                    (myNodeState != NODE_UP))
	                  {
	                    nodeStates[rcvMsgBuf.data.statusMsg.nodeNumber] = 
	                              rcvMsgBuf.data.statusMsg.nodeStatus;
	                    clulog(LOG_DEBUG,
"Skipping message %s from node %d\n",
		      nodeStateStrings[rcvMsgBuf.data.statusMsg.nodeStatus],
	              rcvMsgBuf.data.statusMsg.nodeNumber);
	                    break; 
	                  }

	                /*
	                 * Call the nodeChange() function to determine if
	                 * we should change the state of any of the services
	                 * due to this event.
	                 */
			nodeChange(rcvMsgBuf.data.statusMsg.nodeNumber,
	                           rcvMsgBuf.data.statusMsg.nodeStatus);
	                break;

		      case NODE_UNINITIALIZED:	// ignore
	                break;

	            default:
			clulog(LOG_DEBUG, 
"Received unknown request %d\n", rcvMsgBuf.hdr.command);
	              break;
	            }
	  	 }
	      }
	  }
	if (listen_fd >= 0)
	    msg_close(listen_fd);
}

