static char dqs_execd_rcsid[]="$Id: dqs_execd.c,v 1.2 1998/10/21 14:42:01 green Exp $";

/*----------------------------------------------------
 * dqs_execd.c Tom Green Mon Jan 31 10:42:43 1994
 *
 * Copyright 1993
 *
 * SUPER COMPUTER COMPUTATIONS RESEARCH INSTITUTE
 *            FLORIDA STATE UNIVERSITY
 *
 *
 * SCRI representatives make no claims about the
 * suitability of this software for any purpose.
 * It is provided "as is" without express or
 * implied warranty.
 *
 * $Log: dqs_execd.c,v $
 * Revision 1.2  1998/10/21 14:42:01  green
 * Red Hat Alpha port - ADDRLEN incorporated
 *
 * Revision 1.1.1.1  1998/08/18 14:39:11  green
 * DQS 3.2.0.5 WIP Import
 *
 * Revision 1.1.1.1  1997/04/10 15:10:32  green
 * DQS 3.1.3.4.1 Distribution
 *
 * Revision 3.19  1996/11/20 23:03:42  nrl
 * Several fixes submitted by or as a result of investigations by
 * Ron Lee, Bodo Bechenback, Guntram Wolski and Frank Dwyyer.
 *
 * Revision 3.18  1996/06/27  01:55:51  nrl
 * changes to accommodate osf gcc
 *
 * Revision 3.17  1996/06/25  23:12:05  nrl
 * repaired mailer and output file problems
 *
 * Revision 3.16  1996/03/22  04:20:25  nrl
 * Added error cataloguing number to all routines
 *
 * Revision 3.15  1996/03/12  17:12:03  nrl
 * removed aborts and replaced with an error messaging scheme
 * to send email to the dqs administrator and wait for
 * actions by that administrator
 *
 * Revision 3.14  1996/02/19  19:02:05  nrl
 * added a separate subpriority field, plus scheduling_flags and
 * job_seq_number to remove the 3.1.2.4 kludges, modified the
 * scheduling algorithm once again
 *
 * Revision 3.13  1996/02/07  13:07:59  nrl
 * Added "process leader" and TMP_FILES link capability
 *
 * Revision 3.12  1995/02/26  03:32:21  nrl
 * Added error checking on qmaster checkin
 *
 * Revision 3.11  1995/02/24  14:43:48  nrl
 * added errno to all socket CRITICAL error messages
 *
 * Revision 3.10  1995/02/22  14:29:21  nrl
 * added "FREE" macro to make sure all freed pointers are NULL,
 * replaced all calls to free( ) with FREE.
 *
 * Revision 3.9  1995/02/14  21:41:13  nrl
 * added bulletproofing for NULL tid when doing cleanup of
 * crashed jobs
 *
 * Revision 3.8  1995/02/05  00:51:06  nrl
 * Changed meaning of "-clean" option to mean cleanout all
 * options potentially resettable by qalter. Added interactive
 * prompting.
 *
 * Revision 3.7  1995/01/30  15:21:53  nrl
 * added "tid" verification between execd and qmaster to prevent
 * "ghost" jobs from persisting in visible queue. Changed ERROR messages
 * which were for information only to DEBUG messages.
 *
 * Revision 3.6  1994/11/14  14:09:08  green
 * initialized a couple pointers that were causing CDs under ALPHA and
 * Solaris
 *
 * Revision 3.5  1994/06/15  15:29:15  green
 * support for using DQS trusted host list for dshd
 *
 *      passing of Host_head -n dqs_c_dqs_execd.c
 *      time stamp Host_head on deletion in dqs_c_qconf.c
 *      ck trusted host list in dqs_dshd.c
 *      grab Host_head at startup in dqs_execd.c
 *      rebuild Host_head/Host_hash in dqs_execd_rebuild_host_hash.c
 *      dqs_free_hash in dqs_hash.c
 *      grab new Host_head in dqs_load_avg.c
 *      error log/printing in dsh.c
 *
 * Bug in my syslog code (or certain vendors') required nullifying use
 * of syslogd until I can track it down...
 *
 * Revision 3.4  1994/06/12  02:32:39  green
 * removed "dshd" and moved the functionality in the dqs_execd - this
 * will allow for the dqs_execd to set queue/job resources and to
 * reap/report rusage
 *
 * some really strange timing errors in the "dshd" connecting to the
 * stderr port on "dsh" under Linux forced introduction of a sleep(1);
 * this needs to be followed up on.
 *
 * dqs_dshd_service no longer needed
 *
 * allowed SIGCHLD to interrupt io_mask
 *
 * Revision 3.3  1994/06/11  19:20:06  green
 * moved the execl(dshd) out of /etc/inetd.conf and into the dqs_execd.
 * (not nearly as easy as I had thought...)
 *
 * no longer need DQS_DSHD_SERVICE
 *
 * mods were required to dqs_send_receive_list() which need to be
 * propagated to ALL ancillaries...
 *
 * Revision 3.2  1994/05/30  23:56:16  green
 * added necessary defs for DCMD/DSH to def.h
 *
 * added notes on necessary hooks to dqs_execd.c reqd for direct execing
 * of dsh
 *
 * added "#include <setjmp.h>" to h.h
 *
 * Revision 3.1  1994/03/24  19:04:46  green
 * had some static holdovers of DQS_EXECD_SERVICE that in reality should
 * have been conf.dqs_execd_service.
 *
 * Revision 3.0  1994/03/07  04:13:47  green
 * 3.0 freeze
 *
 * Revision 1.11  1994/03/04  14:59:39  green
 * initialized job-start_time
 *
 * Revision 1.10  1994/03/01  22:03:01  green
 * setsockopt on mips-dec-OSF1-1.0 requires s_in.sin_addr.s_addr
 * to be set before a bind.<dqs_execd.c>
 *
 * removed some debug code from dqs_resolve.c
 *
 * Revision 1.9  1994/03/01  19:23:36  green
 * removed "test" from the default dependency list in Makefile.proto.
 *
 * changed dqs_show_all_cofigurations() to dqs_show_all_configurations()
 * in dqs_execd.c, qmaster.c and dqs_utility.c.
 *
 * fixed the broken comment exclusion feature in dqs_resolve.c.
 *
 * fixed bug in me.default_cell in dqs_setup.c where conf_file
 * default_cell was not updating me.default_cell.
 *
 * getenv(DQS_CELL) mved out of dqs_getme() into dqs_setup()
 * to allow dqs_getme() to be called on forks.
 *
 * remade func.h.
 *
 * pulled garbage out of test.c.
 *
 * added syntax instructions to resolve_file
 *
 * Revision 1.8  1994/03/01  00:05:57  green
 * removed some superfluous code.
 *
 * Revision 1.7  1994/02/25  23:13:19  green
 * added "PID_FILE" to def.h to log process id.
 *
 * modified dqs_execd.c and qmaster.c to log pids to a file
 *
 * added dqs_log_pid() to dqs_utility.c
 *
 * remade func.h
 *
 * forced insertion of "green" into Man_head  -- this needs to be removed
 * and a def in dqs.h if running as non-root.
 *
 * Revision 1.6  1994/02/24  18:11:26  green
 * added dqs_show_all_cofigurations() to dqs_execd.c an qmaster.c
 *
 * modified dqs_execd.c and dqs_load_avg.c to use me.default_cell rather than
 * conf.default_cell
 *
 * previous message about errant pointer reference was incorrect.
 * dqs_sig_handlers.c put back like it was previously
 *
 * there was however a point problem with dusage->master in dqs_setup.c.
 *
 * added dqs_show_all_cofigurations() to dqs_utility.c
 *
 * remade func.h
 *
 * activated the "-cell cell_name" option for dqs_execd in globals.h
 *
 * changed fscanf() to fgets() in qsub.c to fix problems on SV machines
 * and to avoid byte-stuffing problems.
 *
 * Revision 1.5  1994/02/17  17:38:20  green
 * moved dqs_daemonize() out of dqs_setup_dqs_execd() and dqs_setup_qmaster()
 * and into dqs_execd.c and qmaster.c respectively to allow the "-passwd"
 * and "-verify" options to function properly.
 *
 * Revision 1.4  1994/02/17  14:49:03  green
 * added MAX_KLOG_TIME to def.h
 *
 * added some strategic dqs_set_coresize_2_0() as CYAs
 *
 * nuked some dqs_set_coresize_back_normal()
 *
 * dinked with ALRM handlers for "robustness"
 *
 * NOTE: HPUX does not support core limit size --- sad...
 *
 * Revision 1.3  1994/02/17  14:15:00  green
 * yanked the sleep out of dqs_reauth_job() and placed in qmaster,
 * dqs_execd and dqs_exec_job() such that the sleep only occurs at
 * startup.  This minimizes "re-authing" overhead.
 *
 * The AFS reauthing will have to be rewritten anyway since AFS
 * behaves differently on different platforms.(eg: requires reauthing
 * process to be of the same pgroup on "non-AIX" platforms. (UGH! a
 * sheepherder process being required...)
 *
 * Revision 1.2  1994/02/09  19:48:05  green
 * syncing source with docs
 *
 * Revision 1.1.1.1  1994/02/01  17:57:40  green
 * DQS 3.0 ALPHA
 *
 *--------------------------------------------------*/


#define MAINPROGRAM
#include "h.h"
#include "def.h"
#include "dqs.h"
#include "struct.h"
#include "func.h"
#include "globals.h"
#include "dqs_errno.h"

/******************************************************************************/
/*
 * main - entry point of the dqs_execd daemon.
 *
 * Order of operations, as performed below:
 *   1. dqs_setup() and command-line parsing via dqs_args2list() /
 *      dqs_parse_job(); exits with -1 if no options structure comes back.
 *   2. If options->verify is set, the configuration is printed and the
 *      process exits with 0 before daemonizing.
 *   3. dqs_daemonize(), optional re-authentication, signal handler setup.
 *   4. A TCP socket is created and bound to the port of the service named
 *      by conf.dqs_execd_service; any failure here is fatal (abort()).
 *   5. Outstanding rusage records are reported (retrying with a random
 *      back-off), then the daemon checks in with the qmaster and hands the
 *      chain of the reply to dqs_execd_rebuild_host_hash().
 *   6. The SIGALRM handler is re-installed, the socket is put into the
 *      listening state and the main service loop runs forever.
 *
 * The function never returns normally; it leaves through exit(), abort()
 * or whatever dqs_shutdown() does.
 */
int main(argc, argv,envp)
     int argc;
     char **argv;
     char **envp;
     
{
  
  int on = 1;
  /* Initialized so the first dqs_send_list() call below does not read an   */
  /* indeterminate value.  NOTE(review): dqs_send_list() presumably ignores */
  /* this argument when a cell/service pair is given - confirm.             */
  int sfd = -1;
  int sfd0;
  struct servent *sp;
  struct sockaddr_in from;
  ADDRLEN fromlen=sizeof(from);
  struct sockaddr_in s_in;
  dqs_list_type listel;
  dqs_list_type *listel_ptr=NULL;   
  dqs_list_type *listel_ptr2=NULL;
  dqs_list_type  *argv_list=(dqs_list_type *) NULL;
  
  DENTER_MAIN((DQS_EVENT,"dqs_execd"));
  dqs_setup(DQS_EXECD,argv[0]);
  argv_list=dqs_args2list(++argv,argc);
  options=dqs_parse_job(argv_list,envp);
  if (!options)
    {
      ERROR((DQS_EVENT,"DQS_ERROR_0218 error: parsing options"));
      DEXITE;
      exit(-1);
    }
  
  dqs_get_passwd_info(options);
  
  /* verification runs only display the configuration; no daemon is started */
  if (options->verify)
    {
      dqs_show_all_configurations();
      DEXIT;
      exit(0);
    }
  
  dqs_daemonize();
  
  if (dqs_reauth(options))
    sleep(MAX_KLOG_TIME); /* UGH!  I know - I know! */
  
  dqs_setup_sig_handlers();
  
  /***** set up socket *****/
  
  if(!(sp = getservbyname(conf.dqs_execd_service, "tcp"))) {
    CRITICAL((DQS_EVENT,"DQS_ERROR_0219 %s: Bad service?  %d",conf.dqs_execd_service,errno));
    abort();
  }
  
  if ((sfd0 = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
    CRITICAL((DQS_EVENT,"DQS_ERROR_0220 socket creation error %d ",errno));
    abort();
  }
  
  if(setsockopt(sfd0, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) ){
    CRITICAL((DQS_EVENT,"DQS_ERROR_0221 socket option error %d ",errno));
    abort();
  }
  
  /* clear the whole address first so the padding bytes (sin_zero) and any */
  /* platform specific members are not handed to bind() as stack garbage   */
  bzero((char *)&s_in,sizeof(s_in));
  s_in.sin_family =  AF_INET;
  s_in.sin_addr.s_addr=htonl(INADDR_ANY);
  s_in.sin_port = sp->s_port;
  if (bind(sfd0,(struct sockaddr *) &s_in, sizeof(s_in))) {
    CRITICAL((DQS_EVENT,"DQS_ERROR_0222 bind failure  %d \n check for duplicate port numbers in /etc/services",errno));
    abort();
  }
  
  dqs_log_pid();
  
  /* at this point we are sure we are the only dqs_execd */
  /* first we have to report any reaped children that might exist */
  
  sigprocmask(SIG_SETMASK,&io_mask,&omask);
  while (dqs_report_rusage()) 
    {
      ERROR((DQS_EVENT,"DQS_ERROR_0223 Error reporting reaped children"));
      sleep(dqs_rand(MIN_BACKOFF_TIME,MAX_BACKOFF_TIME));
    }
  
  /* now we have to check in with the qmaster */
  /* the check-in element carries who we are, the STARTING_UP marker, */
  /* our qualified host name and (refreshed per attempt) the load     */
  
  bzero((char *)&listel,sizeof(listel));
  listel.type=me.who;
  listel.int0=STARTING_UP;
  listel.str0=dqs_string_insert(NULL,me.qualified_hostname);
  while (TRUE) {
    DPRINTF((DQS_EVENT,"*****Checking In With qmaster*****"));
    /* random back-off before every attempt, including the first one */
    sleep(dqs_rand(MIN_BACKOFF_TIME,MAX_BACKOFF_TIME));
    alarm(0);
    listel.int1=dqs_get_loadavg();
    if ((sfd=dqs_send_list(me.default_cell,conf.qmaster_service,sfd,&listel))<0)
      {
        ERROR((DQS_EVENT,"DQS_ERROR_0224 error: unable to check in with qmaster"));
        continue;
      }
    if (dqs_get_list(sfd,&listel_ptr2))
      {
        ERROR((DQS_EVENT,"DQS_ERROR_0225 error: unable get checkin list back from qmaster"));
        continue;
      }
    dqs_close_sfd(sfd);
    break;
  }
  /* a NAK from the qmaster is fatal; its str0 carries the reason text */
  if(listel_ptr2->status==DQS_NAK){
    ERROR((DQS_EVENT,"DQS_ERROR_0226 Cannot Checkin with Qmaster\n %s",listel_ptr2->str0));
    abort();
  }      
  
  FREE(listel.str0);
  listel_ptr=dqs_free_list(listel_ptr);
  
  listel_ptr2->chain=dqs_execd_rebuild_host_hash(listel_ptr2->chain);
  listel_ptr2=dqs_free_list(listel_ptr2);
  
  /* since sleep() was called we need to re-install the           */
  /* the SIGALRM handler - well, at least on some systems we do.  */
  /* and yes - i know i shouldn't be mixing sleep() and sigaction */
  
  sigalrm_vec.sa_handler=dqs_alarmclock; 
  sigfillset(&sigalrm_vec.sa_mask);
  sigalrm_vec.sa_flags=0;
  sigaction(SIGALRM,&sigalrm_vec,&sigalrm_ovec);
  
  /* at this point we can start listening for work */
  listen(sfd0,8);
  
  /***** MAIN LOOP *****/
  while (TRUE) 
    {
      DPRINTF((DQS_EVENT,"--------------------------------------"));
      dqs_reauth(options);
      dqs_ck_to_do_list();
      sigprocmask(SIG_SETMASK,&default_mask,&omask);
      if (shut_me_down)
        dqs_shutdown();
      
      if (jobs_to_start)
        dqs_start_jobs();
      
      sigprocmask(SIG_SETMASK,&io_mask,&omask);
      if (dead_children)
        dqs_reap_children();
      
      sigprocmask(SIG_SETMASK,&io_mask,&omask);
      if (Rusage_head) 
        dqs_report_rusage();
      
      dqs_next_alarm(); 
      
      sigprocmask(SIG_SETMASK,&default_mask,&omask);
      /* accept() treats the length as value-result: it must hold the size */
      /* of the buffer on entry and is overwritten on return, so it is     */
      /* reset before every call                                           */
      fromlen=sizeof(from);
      if ((sfd = accept(sfd0,(struct sockaddr *) &from, &fromlen)) < 0) 
        {
          /* nothing accepted (for instance the call was interrupted by a */
          /* signal) - go round again and service the pending flags       */
          continue;
        }
      else {
        sigprocmask(SIG_SETMASK,&io_mask,&omask);
        dqs_read_socket(sfd);
        sigprocmask(SIG_SETMASK,&default_mask,&omask);
      }
      
    }
}

/******************************************************************************/
int dqs_next_alarm()
     
     /*
       several responsibilities here:
       a) next time to send in load
       b) next time to re-authenticate

       Sends the load report when at least conf.load_log_time seconds lie
       between the current time and the time of the previous report, then
       (always) arms an alarm for conf.load_log_time seconds.  While
       dqs_send_load() runs, a separate alarm of ALARMS seconds is armed.

       Always returns 0.
     */
     
{
  
  u_long now;
  u_long elapsed;
  static u_long then=0;   /* time of the last load report, 0 = never */
  
  DENTER((DQS_EVENT,"dqs_next_alarm"));
  
  now=dqs_get_gmt();
  
  /* Both time stamps are unsigned, so a plain subtraction in the "wrong"  */
  /* direction wraps around to a huge value.  Testing (now-then) and       */
  /* (then-now) separately therefore fired whenever the two differed at    */
  /* all.  Take the real distance instead; this still copes with a clock   */
  /* that has been stepped backwards.                                      */
  elapsed = (now >= then) ? (now-then) : (then-now);
  
  /* ">=" rather than ">": the alarm armed below expires after exactly     */
  /* load_log_time seconds, and that wake-up must already count as due,    */
  /* otherwise the report would slip to every second interval.             */
  if (elapsed >= conf.load_log_time)
    {
      then = now;
      alarm(ALARMS);
      dqs_send_load();
    }
  
  /* (re)arm the periodic alarm in every case */
  alarm(conf.load_log_time);
  
  DEXIT;
  return(0);
  
}

/******************************************************************************/
int dqs_read_socket(sfd)
     int sfd;
     
     /*
       Reads one request list from the connection "sfd" and dispatches on
       the "who" field of its first element:

       QMASTER  the peer is checked with dqs_q_master(); on a non-zero
                result a DQS_NAK element is sent back and the connection
                is closed, otherwise the request is handed to
                dqs_c_qmaster().
       DSH      the connection is handed to dqs_dshd(), then closed with a
                plain close() and the pending alarm is cancelled.
       other    the connection is closed.

       Returns 0 when a QMASTER or DSH request was dispatched, -1 when the
       list could not be read, the QMASTER peer was rejected, or the
       request type is unknown.

       NOTE(review): the list obtained from dqs_get_list() is not released
       on the DSH, unknown-type and rejected-QMASTER paths - confirm who
       owns it and whether it should be freed here.
     */
     
{
  
  dqs_list_type nak;
  dqs_list_type *request=NULL;
  
  DENTER((DQS_EVENT,"dqs_read_socket"));
  
  DPRINTF((DQS_EVENT,"*************accepted new connection***********"));
  
  if (dqs_get_list(sfd,&request))
    {
      DPRINTF((DQS_EVENT,"lost connection - unable to obtain list"));
      DEXITE;
      return(-1);
    }
  
  switch(request->who)
    {
      /*------------------------------------------------------*/
    case QMASTER:
      if (!dqs_q_master(sfd))
        {
          /* accepted peer: let the qmaster request handler take over */
          dqs_c_qmaster(sfd,&request);
          DEXIT;
          return(0);
        }
      
      /* rejected peer: answer with a NAK carrying the reason, then hang up */
      bzero((char *)&nak,sizeof(nak));
      nak.status=DQS_NAK;
      nak.str0=dqs_string_insert(NULL,
                                 "error: you are not a DQS trusted host");
      (void) dqs_send_list(NULL,NULL,sfd,&nak);
      FREE(nak.str0);
      dqs_close_sfd(sfd);
      break;
      
      /*------------------------------------------------------*/
    case DSH:
      DPRINTF((DQS_EVENT,"CASE DSH: sfd=%d",sfd));
      dqs_dshd(sfd);
      /* do NOT call dqs_close_sfd(sfd) HERE! */
      SFD=9999;
      close(sfd);
      alarm(0);
      DEXIT;
      return(0);
      
      /*------------------------------------------------------*/
    default:
      DPRINTF((DQS_EVENT,"What kind of friggin request was that!?"));
      dqs_close_sfd(sfd);
      DEXIT;
      return(-1);
      
    }
  
  /* only reached from the rejected-QMASTER path above */
  DEXITE;
  return(-1);
}

/************************************************************************************/
int dqs_report_rusage()
     
     /*
       Sends the list anchored at Rusage_head to the qmaster service of
       the default cell and waits for the reply.

       Returns  0  when there is nothing to report, or when the reply
                   carried DQS_ACK; in the latter case the file named by
                   str1 of every list element is removed from RUSAGE_DIR
                   and the list itself is freed.
               -1  when the list could not be sent, no reply could be
                   read, or the reply status was not DQS_ACK.  Rusage_head
                   is left untouched so the caller may retry.

       NOTE(review): a TID mismatch between request and reply is only
       logged; the records are still removed afterwards - confirm that
       this is intended.
     */
     
{
  
  /* Initialized so dqs_send_list() is not passed an indeterminate value. */
  /* NOTE(review): presumably ignored when a cell/service pair is given - */
  /* confirm.                                                             */
  int              sfd=-1;
  dqs_list_type    *lp=NULL;
  dqs_list_type    *response_list=NULL;
  
  
  DENTER((DQS_EVENT,"dqs_report_rusage"));
  
  if (!Rusage_head) {
    INFO((DQS_EVENT,"DQS_ERROR_0227 No rusage stats to report"));
    DEXIT;
    return(0);
  }
  
  if ((sfd=dqs_send_list(me.default_cell,conf.qmaster_service,sfd,Rusage_head))<0)
    {
      DEXITE;
      return(-1);
    }
  
  if (dqs_get_list(sfd,&response_list))
    {
      DEXITE;
      return(-1);
    }
  dqs_close_sfd(sfd);
  
  /* anything but an explicit ACK counts as a failed report */
  if (response_list->status!=DQS_ACK)
    {
      response_list=dqs_free_list(response_list);
      DEXITE;
      return(-1);
    }
  
  /* compare the transaction ids of request and reply; a missing tid on */
  /* either side gets its own message because the detailed one below    */
  /* dereferences both                                                  */
  if( !dqs_ifequal_tid(Rusage_head->tid,response_list->tid) ){
    if( (!Rusage_head->tid) || (!response_list->tid) ) {
      ERROR((DQS_EVENT,"DQS_ERROR_0228 TID error... no tid") );
    }
    else {
      ERROR((DQS_EVENT,"DQS_ERROR_0229 TID error  %d %d %d %d %s %s",
             Rusage_head->tid->int0,response_list->tid->int0,
             Rusage_head->tid->int1,response_list->tid->int1,
             Rusage_head->tid->str0,response_list->tid->str0));
    }
    
  }
  response_list=dqs_free_list(response_list);
  
  /* the report was acknowledged: drop the on-disk copy of every record */
  for (lp=Rusage_head; lp; lp=lp->next)
    dqs_unlink(RUSAGE_DIR,lp->str1);
  
  Rusage_head=dqs_free_list(Rusage_head);
  
  DEXIT;
  return(0);
  
}

/************************************************************************************/
int dqs_start_jobs()
     
{
  
  int              status;
  dqs_list_type    *listel_ptr=NULL;
  dqs_list_type    *listel_ptr2=NULL;
  dqs_list_type    listel;
  dqs_rusage_type  dusage;
  
  
  DENTER((DQS_EVENT,"dqs_start_jobs"));
  
  if (!Job_head) {
    DPRINTF((DQS_EVENT,"No jobs to start"));
    DEXITE;
    return(0);
  }
  
  listel_ptr=Job_head;
  
  while (listel_ptr)
    {
      if (listel_ptr->job->status==IDLE)
	{
	  DPRINTF((DQS_EVENT,"dqs_start_jobs() listel_ptr->job->dqs_job_name>%s<",listel_ptr->job->dqs_job_name));
	  bzero((char *)&listel,sizeof(listel));
	  listel_ptr->job->status=RUNNING;
	  listel_ptr->job->start_time=dqs_get_gmt();
	  listel_ptr->job->pid=dqs_exec_job(listel_ptr);
	  
	  DPRINTF((DQS_EVENT,"***EXECING \"%s\"on %s ",listel_ptr->job->dqs_job_name,me.unqualified_hostname));
	  if (listel_ptr->job->pid<0) 
	    { /* this could only happen if we are out of PIDs/fds/memory/etc - eg: very bad stuff */
	      /* as such we bail */
	      CRITICAL((DQS_EVENT,"DQS_ERROR_0230 error: couldn't exec jid %s",listel_ptr->job->dqs_job_name));
	      
	      
	      listel_ptr->str0=dqs_string_insert(listel_ptr->str0,
						 "failure in start jobs, cannot exec the job");
	      dqs_report_problem( listel_ptr,  -1 );                 
	    }
	}
      listel_ptr=listel_ptr->next;           
    }
  
  DEXIT;
  return(0);
  
}
