/*
*         OpenPBS (Portable Batch System) v2.3 Software License
*
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
*
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
* other than those described below, or to purchase support for this software,
* please contact Veridian Systems, PBS Products Department ("Licensor") at:
*
*    www.OpenPBS.org  +1 650 967-4675                  sales@OpenPBS.org
*                        877 902-4PBS (US toll-free)
* ---------------------------------------------------------------------------
*
* This license covers use of the OpenPBS v2.3 software (the "Software") at
* your site or location, and, for certain users, redistribution of the
* Software to other sites and locations.  Use and redistribution of
* OpenPBS v2.3 in source and binary forms, with or without modification,
* are permitted provided that all of the following conditions are met.
* After December 31, 2001, only conditions 3-6 must be met:
*
* 1. Commercial and/or non-commercial use of the Software is permitted
*    provided a current software registration is on file at www.OpenPBS.org.
*    If use of this software contributes to a publication, product, or
*    service, proper attribution must be given; see www.OpenPBS.org/credit.html
*
* 2. Redistribution in any form is only permitted for non-commercial,
*    non-profit purposes.  There can be no charge for the Software or any
*    software incorporating the Software.  Further, there can be no
*    expectation of revenue generated as a consequence of redistributing
*    the Software.
*
* 3. Any Redistribution of source code must retain the above copyright notice
*    and the acknowledgment contained in paragraph 6, this list of conditions
*    and the disclaimer contained in paragraph 7.
*
* 4. Any Redistribution in binary form must reproduce the above copyright
*    notice and the acknowledgment contained in paragraph 6, this list of
*    conditions and the disclaimer contained in paragraph 7 in the
*    documentation and/or other materials provided with the distribution.
*
* 5. Redistributions in any form must be accompanied by information on how to
*    obtain complete source code for the OpenPBS software and any
*    modifications and/or additions to the OpenPBS software.  The source code
*    must either be included in the distribution or be available for no more
*    than the cost of distribution plus a nominal fee, and all modifications
*    and additions to the Software must be freely redistributable by any party
*    (including Licensor) without restriction.
*
* 6. All advertising materials mentioning features or use of the Software must
*    display the following acknowledgment:
*
*     "This product includes software developed by NASA Ames Research Center,
*     Lawrence Livermore National Laboratory, and Veridian Information
*     Solutions, Inc.
*     Visit www.OpenPBS.org for OpenPBS software support,
*     products, and information."
*
* 7. DISCLAIMER OF WARRANTY
*
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT
* ARE EXPRESSLY DISCLAIMED.
*
* IN NO EVENT SHALL VERIDIAN CORPORATION, ITS AFFILIATED COMPANIES, OR THE
* U.S. GOVERNMENT OR ANY OF ITS AGENCIES BE LIABLE FOR ANY DIRECT OR INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/

#include <pbs_config.h>   /* the master config generated by configure */
#include "node_manager.h"

#include <string>
#include <sstream>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <time.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <stdarg.h>
#include <assert.h>
#if defined(NTOHL_NEEDS_ARPA_INET_H) && defined(HAVE_ARPA_INET_H)
#include <arpa/inet.h>
#endif
#include <vector>

#include "portability.h"
#include "libpbs.h"
#include "server_limits.h"
#include "list_link.h"
#include "attribute.h"
#include "resource.h"
#include "server.h"
#include "net_connect.h"
#include "batch_request.h"
#include "work_task.h"
#include "svrfunc.h"
#include "pbs_job.h"
#include "log.h"
#include "../lib/Liblog/pbs_log.h"
#include "../lib/Liblog/log_event.h"
#include "pbs_nodes.h"
#include "dis.h"
#include "dis_init.h"
#include "resmon.h"
#include "mcom.h"
#include "utils.h"
#include "u_tree.h"
#include "threadpool.h"
#include "node_func.h" /* find_nodebyname */
#include "../lib/Libutils/u_lock_ctl.h" /* lock_node, unlock_node */
#include "../lib/Libnet/lib_net.h" /* socket_read_flush */
#include "../lib/Libutils/lib_utils.h" /* have_incompatible_dash_l_resource */
#include "svr_func.h" /* get_svr_attr_* */
#include "alps_functions.h"
#include "login_nodes.h"
#include "svr_connect.h" /* svr_disconnect_sock */
#include "net_cache.h"
#include "ji_mutex.h"
#include "alps_constants.h"
#include "mutex_mgr.hpp"
#include "timer.hpp"
#include "id_map.hpp"
#ifdef PENABLE_LINUX_CGROUPS
#include "complete_req.hpp"
#endif
#ifdef NVML_API
#include <nvml.h>
#endif

/* True when STR is a non-NULL pointer to a non-empty C string. */
#define IS_VALID_STR(STR)  (((STR) != NULL) && ((STR)[0] != '\0'))

extern int              LOGLEVEL;

#if !defined(H_ERRNO_DECLARED) && !defined(_AIX)
/*extern int              h_errno;*/
#endif
  

int                     svr_totnodes = 0; /* total number nodes defined       */
int                     svr_unresolvednodes = 0; /* number of nodes from the nodes file that failed to resolve. */
int                     svr_clnodes  = 0; /* number of cluster nodes     */
int                     svr_chngNodesfile = 0; /* 1 signals want nodes file update on server shutdown, (qmgr mods) */
int                     gpu_mode_rqstd = -1;  /* default gpu mode requested */
int                     gpu_err_reset = FALSE;    /* was a gpu errcount reset requested */

all_nodes               allnodes;

static int              num_addrnote_tasks = 0; /* number of outstanding send_cluster_addrs tasks */
/* protects num_addrnote_tasks; allocated on first use in setup_notification() */
pthread_mutex_t        *addrnote_mutex = NULL;

extern int              server_init_type;
extern int              has_nodes;

extern int create_a_gpusubnode(struct pbsnode *);
int        is_gpustat_get(struct pbsnode *np, char **str_ptr);

extern int              ctnodes(char *);

extern char            *path_home;
extern char            *path_nodes;
extern char            *path_node_usage;
extern char            *path_nodes_new;
extern char            *path_nodestate;
extern char            *path_nodepowerstate;
extern char            *path_nodenote;
extern char            *path_nodenote_new;
extern char             server_name[];

extern struct server    server;
extern tlist_head       svr_newnodes;
extern attribute_def    node_attr_def[];   /* node attributes defs */
extern int              SvrNodeCt;

extern int              multi_mom;

const int               network_fail_wait_time = 300;
int                     default_gpu_mode = -1;

#define SKIP_NONE       0
#define SKIP_EXCLUSIVE  1
#define SKIP_ANYINUSE   2
#define SKIP_NONE_REUSE 3

#ifndef MAX_BM
#define MAX_BM          64
#endif

int handle_complete_first_time(job *pjob);
int is_compute_node(char *node_id);
int hasprop(struct pbsnode *, struct prop *);
int node_satisfies_request(struct pbsnode *,char *);
int reserve_node(struct pbsnode *, job *, char *, job_reservation_info &);
int procs_available(int proc_ct);
void check_nodes(struct work_task *);
int gpu_entry_by_id(struct pbsnode *,char *, int);
job *get_job_from_jobinfo(struct jobinfo *,struct pbsnode *);
int remove_job_from_node(struct pbsnode *pnode, int internal_job_id);

/* marks a stream as finished being serviced */
/* NOTE(review): the comment above does not appear to describe node_state_mutex
 * and looks like a leftover from a removed declaration - confirm and reword */
pthread_mutex_t        *node_state_mutex = NULL;





/**
**      Modified by Tom Proett <proett@nas.nasa.gov> for PBS.
*/

/* AVL tree searched with an (address, port) key by tfind_addr() and
 * stream_eof() to locate the pbsnode for a mom */
AvlTree                 ipaddrs = NULL;


/**
 * specialized version of tfind for looking in the ipaddrs tree
 *
 * If the node found has no node boards, or job_momname is NULL, the node
 * itself is returned locked. Otherwise the board index is read from the
 * digits following the last '-' in the first '+'-delimited entry of
 * job_momname, the parent node is unlocked, and the matching entry of the
 * parent's node_boards tree is returned locked (or NULL if there is none).
 * If no '-' is present, an error is logged and the parent is returned locked.
 *
 * @param key - the address of the node we are searching for
 * @param port - the port that is combined with key for the lookup
 * @param job_momname - (optional) exec host text used to choose a node board.
 *        The first '+' in it is overwritten while parsing and is restored
 *        before this function returns on every path.
 * @return a pointer to the (locked) pbsnode, or NULL if none was found
*/

struct pbsnode *tfind_addr(

  const u_long  key,
  uint16_t      port,
  char         *job_momname)

  {
  struct pbsnode *pn = AVL_find(key,port,ipaddrs);

  if (pn == NULL)
    return(NULL);

  lock_node(pn, __func__, "pn", LOGLEVEL);

  if ((pn->num_node_boards == 0) ||
      (job_momname == NULL))
    return(pn);

  /* only look at the first entry of a '+'-delimited host list */
  char *plus = strchr(job_momname, '+');

  if (plus != NULL)
    *plus = '\0';

  /* the board index follows the last dash of that entry */
  char *dash = strrchr(job_momname, '-');
  int   index = 0;

  if (dash != NULL)
    index = atoi(dash + 1);

  /* parsing is done: put the caller's string back the way we found it.
   * This previously did not happen on the no-dash return below. */
  if (plus != NULL)
    *plus = '+';

  if (dash == NULL)
    {
    /* node has numa nodes but no dashes in exec host?? */
    log_err(-1, __func__, "Numa node but there's no dash in exec_host?");

    return(pn);
    }

  struct pbsnode *numa = AVL_find(index, pn->nd_mom_port, pn->node_boards);

  unlock_node(pn, __func__, "pn->numa", LOGLEVEL);

  if (numa != NULL)
    lock_node(numa, __func__, "numa", LOGLEVEL);

  return(numa);
  } /* END tfind_addr() */



/*
 * Clears every bit that is set in flag from the node's nd_state,
 * leaving all other state bits untouched.
 */

void remove_node_state_flag(
    
  struct pbsnode *pnode,
  int             flag)

  {
  const int keep_mask = ~flag;

  pnode->nd_state &= keep_mask;
  } // END remove_node_state_flag()



void check_node_jobs_existence(
    
  struct work_task *pwt)

  {
  char *node_name = (char *)pwt->wt_parm1;

  free(pwt->wt_mutex);
  free(pwt);

  pbsnode *pnode = find_nodebyname(node_name);

  if (pnode != NULL)
    {
    std::vector<int> internal_ids;
    std::vector<int> ids_to_remove;
    
    for (size_t i = 0; i < pnode->nd_job_usages.size(); i++)
      internal_ids.push_back(pnode->nd_job_usages[i].internal_job_id);

    unlock_node(pnode, __func__, "", LOGLEVEL);

    for (size_t i = 0; i < internal_ids.size(); i++)
      {
      // Job doesn't exist, mark this usage record for removal
      if (internal_job_id_exists(internal_ids[i]) == false)
        ids_to_remove.push_back(internal_ids[i]);
      }

    if (ids_to_remove.size() > 0)
      {
      pbsnode *pnode = find_nodebyname(node_name);

      if (pnode != NULL)
        {
        // Erase non-existent job ids
        for (size_t i = 0; i < ids_to_remove.size(); i++)
          remove_job_from_node(pnode, ids_to_remove[i]);
    
        unlock_node(pnode, __func__, "", LOGLEVEL);
        }
      }
    }

  free(node_name);
  } // END check_node_jobs_existence()



/* update_node_state - central location for updating node state */
/* NOTE:  called each time a node is marked down, each time a MOM reports node  */
/*        status, and when pbs_server sends hello/cluster_addrs */

void update_node_state(

  struct pbsnode *np,         /* I (modified) */
  int             newstate)   /* I (one of INUSE_*) */

  {
  char            log_buf[LOCAL_LOG_BUF_SIZE];

  /* No need to do anything if newstate == oldstate */
  if (np->nd_state == newstate)
    return;

  /*
   * LOGLEVEL >= 4 logs all state changes
   *          >= 2 logs down->(busy|free) changes
   *          (busy|free)->down changes are always logged
   */

  if (LOGLEVEL >= 4)
    {
    sprintf(log_buf, "adjusting state for node %s - state=%d, newstate=%d",
      (np->nd_name != NULL) ? np->nd_name : "NULL",
      np->nd_state,
      newstate);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  log_buf[0] = '\0';

  //Node state can't change until the hierarchy has been sent.
  if (np->nd_state & INUSE_NOHIERARCHY)
    {
    sprintf(log_buf, "node %s has not received its list of nodes yet.",
      (np->nd_name != NULL) ? np->nd_name : "NULL");

    }

  if (newstate & INUSE_DOWN)
    {
    if (!(np->nd_state & INUSE_DOWN))
      {
      sprintf(log_buf, "node %s marked down",
        (np->nd_name != NULL) ? np->nd_name : "NULL");

      np->nd_state |= INUSE_DOWN;
      np->nd_state &= ~INUSE_UNKNOWN;
      }

    /* ignoring the obvious possibility of a "down,busy" node */
    }  /* END if (newstate & INUSE_DOWN) */
  else if (newstate & INUSE_BUSY)
    {
    if ((!(np->nd_state & INUSE_BUSY) && (LOGLEVEL >= 4)) ||
        ((np->nd_state & INUSE_DOWN) && (LOGLEVEL >= 2)))
      {
      sprintf(log_buf, "node %s marked busy",
        (np->nd_name != NULL) ? np->nd_name : "NULL");
      }

    np->nd_state |= INUSE_BUSY;

    np->nd_state &= ~INUSE_UNKNOWN;

    if (np->nd_state & INUSE_DOWN)
      {
      np->nd_state &= ~INUSE_DOWN;
      }
    }  /* END else if (newstate & INUSE_BUSY) */
  else if (newstate == INUSE_FREE)
    {
    if (((np->nd_state & INUSE_DOWN) && (LOGLEVEL >= 2)) ||
        ((np->nd_state & INUSE_BUSY) && (LOGLEVEL >= 4)))
      {
      sprintf(log_buf, "node %s marked free",
        (np->nd_name != NULL) ? np->nd_name : "NULL");
      }

    np->nd_state &= ~INUSE_BUSY;
    np->nd_state &= ~INUSE_UNKNOWN;
    np->nd_state &= ~INUSE_DOWN;

    set_task(WORK_Immed, 0, check_node_jobs_existence, strdup(np->nd_name), FALSE);
    }    /* END else if (newstate == INUSE_FREE) */
  else if (newstate & INUSE_NETWORK_FAIL)
    {
    np->nd_state |= INUSE_NETWORK_FAIL;
    }

  if (newstate & INUSE_UNKNOWN)
    {
    np->nd_state |= INUSE_UNKNOWN;
    }

  if ((LOGLEVEL >= 2) && (log_buf[0] != '\0'))
    {
    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  return;
  }  /* END update_node_state() */




/*
 * check_node_for_job - return TRUE if one of pnode's job usage records
 * carries this internal job id, FALSE otherwise
 */

int check_node_for_job(

  struct pbsnode *pnode,
  int             internal_job_id)

  {
  int found = FALSE;
  int idx = 0;

  while ((found == FALSE) &&
         (idx < (int)pnode->nd_job_usages.size()))
    {
    if (pnode->nd_job_usages[idx].internal_job_id == internal_job_id)
      found = TRUE;

    idx++;
    }

  return(found);
  } /* END check_node_for_job() */




/*
 * is_job_on_node - return TRUE if this internal job id is present on pnode
 */

int is_job_on_node(

  struct pbsnode *pnode,           /* I */
  int             internal_job_id) /* I */

  {
  struct pbsnode *numa;

  int             present = FALSE;
  int             i;

  if (pnode->num_node_boards > 0)
    {
    /* check each subnode on each numa node for the job */
    for (i = 0; i < pnode->num_node_boards; i++)
      {
      numa = AVL_find(i,pnode->nd_mom_port,pnode->node_boards);

      lock_node(numa, __func__, "before check_node_for_job numa", LOGLEVEL);
      present = check_node_for_job(pnode, internal_job_id);
      unlock_node(numa, __func__, "after check_node_for_job numa", LOGLEVEL);

      /* leave loop if we found the job */
      if (present != FALSE)
        break;
      } /* END for each numa node */
    }
  else
    {
    present = check_node_for_job(pnode, internal_job_id);
    }

  return(present);
  }  /* END is_job_on_node() */




/*
 * If nodes have similar names this will make sure the name is an exact match,
 * not just found inside another name.
 * i.e. Machines by the name of gpu, gpuati, gpunvidia. If searching for gpu...
 * List format is similar to: gpuati+gpu/1+gpunvidia/4+gpu/5
 *
 * A match must start at the beginning of the list or right after a '+', and
 * must be followed by '+', '/' or the end of the list.
 *
 * @param node_name - the host name to look for
 * @param node_ehl - the exec host list to search (not modified)
 * @param login_node_name - (optional) when the CrayEnabled server attribute is
 *        TRUE and this equals node_name, the node counts as found
 * @return true if node_name is an entry of the list (or the login node as
 *         described above), false otherwise
 */

bool node_in_exechostlist(
    
  const char *node_name,
  char       *node_ehl,
  const char *login_node_name)

  {
  bool    found = false;
  char   *cur_pos = node_ehl;
  char   *new_pos = cur_pos;
  size_t  name_len = strlen(node_name);
  long    cray_enabled = FALSE;

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  if (cray_enabled == TRUE)
    {
    if ((login_node_name != NULL) &&
        (!strcmp(login_node_name, node_name)))
      found = true;
    }

  /* an empty name matches at every position and would walk cur_pos past the
   * terminator, so it is never searched for */
  while ((found == false) &&
         (name_len > 0))
    {
    if ((new_pos = strstr(cur_pos, node_name)) == NULL)
      break;

    bool starts_entry = ((new_pos == node_ehl) || (*(new_pos - 1) == '+'));
    char after = *(new_pos + name_len);

    /* the end of the list is a valid boundary too; this was tested with
     * "pointer == NULL", which is never true, so a trailing "+name" entry
     * was missed */
    bool ends_entry = ((after == '\0') || (after == '+') || (after == '/'));

    if (starts_entry && ends_entry)
      found = true;
    else
      cur_pos = new_pos + 1;
    }

  return(found);
  } /* END node_in_exechostlist() */




/*
 * kill_job_on_mom - log a stray job and send a SIGKILL signal request for it
 * to the mom at pnode
 *
 * pnode is temporarily unlocked (tmp_unlock_node/tmp_lock_node) while the
 * request is issued and while failure counts are updated by node name.
 *
 * @param job_id - the id of the job to kill
 * @param pnode - the node whose mom reported the job
 * @return the value returned by issue_Drequest(), or -1 if the connection or
 *         the request allocation failed
 */

int kill_job_on_mom(

  const char     *job_id,
  struct pbsnode *pnode)

  {
  batch_request *preq;
  int            rc = -1;
  int            conn;
  int            local_errno = 0;
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  std::string    node_name(pnode->nd_name);

  /* job is reported by mom but server has no record of job */
  /* bounded write: job_id comes from the mom's report and its length is not checked here */
  snprintf(log_buf, sizeof(log_buf), "stray job %s found on %s", job_id, pnode->nd_name);
  log_err(-1, __func__, log_buf);
  
  conn = svr_connect(pnode->nd_addrs[0], pnode->nd_mom_port, &local_errno, pnode, NULL);

  if (conn >= 0)
    {
    if ((preq = alloc_br(PBS_BATCH_SignalJob)) == NULL)
      {
      log_err(-1, __func__, "unable to allocate SignalJob request-trouble!");
      svr_disconnect(conn);
      }
    else
      {
      snprintf(preq->rq_ind.rq_signal.rq_jid, sizeof(preq->rq_ind.rq_signal.rq_jid), "%s", job_id);
      snprintf(preq->rq_ind.rq_signal.rq_signame, sizeof(preq->rq_ind.rq_signal.rq_signame), "SIGKILL");
      preq->rq_extra = strdup(SYNC_KILL);
      tmp_unlock_node(pnode, __func__, NULL, LOGLEVEL);
      rc = issue_Drequest(conn, preq, true);

      /* failure counts are tracked by name because the node is unlocked here */
      if (preq->rq_reply.brp_code == PBSE_TIMEOUT)
        update_failure_counts(node_name.c_str(), PBSE_TIMEOUT);
      else
        update_failure_counts(node_name.c_str(), 0);

      free_br(preq);
      tmp_lock_node(pnode, __func__, NULL, LOGLEVEL);
      }
    }
  else
    {
    tmp_unlock_node(pnode, __func__, NULL, LOGLEVEL);
    update_failure_counts(node_name.c_str(), -1);
    tmp_lock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  return(rc);
  } /* END kill_job_on_mom() */



// jobsKilledMutex guards jobsKilled, the internal ids of jobs for which a
// kill has been sent to a MOM and not yet expired.
pthread_mutex_t jobsKilledMutex = PTHREAD_MUTEX_INITIALIZER;
std::vector<int> jobsKilled;
// Default delay, in seconds (it is added to time(NULL) in sync_node_jobs()),
// before another kill may be sent to a MOM for the same job. 60 is one
// minute, not five; the SRV_ATR_job_sync_timeout server attribute is read
// over this default.
#define JOB_SYNC_TIMEOUT 60



/*
 * Delayed work task: forgets that a kill was sent for a job so that the
 * job may be killed again if it is still reported later.
 *
 * pwt->wt_parm1 points to a new-allocated int holding the internal job id;
 * it is deleted here along with the task and its mutex. Only the first
 * matching entry of jobsKilled is erased.
 */

void remove_job_from_already_killed_list(
    
  struct work_task *pwt)

  {
  int       *id_holder = (int *)pwt->wt_parm1;
  const int  job_internal_id = *id_holder;

  delete id_holder;
  free(pwt->wt_mutex);
  free(pwt);

  pthread_mutex_lock(&jobsKilledMutex);

  std::vector<int>::iterator it = jobsKilled.begin();

  while (it != jobsKilled.end())
    {
    if (*it == job_internal_id)
      {
      jobsKilled.erase(it);
      break;
      }

    ++it;
    }

  pthread_mutex_unlock(&jobsKilledMutex);
  } /* END remove_job_from_already_killed_list() */



/*
 * Returns true if job_internal_id is currently recorded in jobsKilled,
 * i.e. a kill has already been sent for it and has not yet been forgotten.
 * The list is scanned while holding jobsKilledMutex.
 */

bool job_already_being_killed(

  int job_internal_id)

  {
  bool in_list = false;

  pthread_mutex_lock(&jobsKilledMutex);

  for (std::vector<int>::const_iterator it = jobsKilled.begin();
       (it != jobsKilled.end()) && (in_list == false);
       ++it)
    {
    in_list = (*it == job_internal_id);
    }

  pthread_mutex_unlock(&jobsKilledMutex);

  return(in_list);
  } /* END job_already_being_killed() */



/*
 * Decides whether a job reported by a mom should be killed on that node.
 *
 * - job has a usage record on the node: no kill
 * - job is unknown to the server: kill
 * - job exists but has no exec_host, or the node is not in its exec_host
 *   list: kill, unless a kill for it is already recorded in the
 *   already-killed list
 *
 * The node is temporarily unlocked while the job is looked up because the
 * job must be locked before the node.
 */

bool job_should_be_killed(

  std::string    &job_id,    
  int             internal_job_id,
  struct pbsnode *pnode)

  {
  job *pjob;

  if (is_job_on_node(pnode, internal_job_id) != FALSE)
    return(false);

  /* must lock the job before the node */
  tmp_unlock_node(pnode, __func__, NULL, LOGLEVEL);

  pjob = svr_find_job_by_id(internal_job_id);

  if (pjob == NULL)
    pjob = svr_find_job(job_id.c_str(), TRUE);

  tmp_lock_node(pnode, __func__, NULL, LOGLEVEL);

  // if the job doesn't exist to pbs_server force a kill
  if (pjob == NULL)
    return(true);

  /* The job exists but has no resources recorded on this node. Check the job
   * struct as well, since the job may be in the middle of being moved around
   * because of data staging, suspend, or rerun. */
  mutex_mgr job_mgr(pjob->ji_mutex,true);

  char *exec_host = pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str;
  bool  belongs_here = false;

  if (exec_host != NULL)
    {
    belongs_here = (node_in_exechostlist(pnode->nd_name,
                                         exec_host,
                                         pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str) != false);
    }

  if (belongs_here)
    return(false);

  return(!job_already_being_killed(internal_job_id));
  } /* END job_should_be_killed() */



/*
 * finish_job - thread entry point that completes a job by id
 *
 * vp is a heap-allocated job id string and is freed here. If the job is
 * found, its nodes are freed, it is moved to the COMPLETE state and
 * handle_complete_first_time() is called. Always returns NULL.
 */

void *finish_job(

  void *vp)

  {
  char *jobid = (char *)vp;

  if (jobid == NULL)
    return(NULL);

  job *pjob = svr_find_job(jobid, TRUE);

  /* the id string is not needed once the lookup has been done */
  free(jobid);

  if (pjob == NULL)
    return(NULL);

  /* the manager must not unlock on exit because
   * pjob->ji_mutex is always returned unlocked from handle_complete_first_time */
  mutex_mgr job_mgr(pjob->ji_mutex,true);
  job_mgr.set_unlock_on_exit(false);

  free_nodes(pjob);

  svr_setjobstate(pjob, JOB_STATE_COMPLETE, JOB_SUBSTATE_COMPLETE, FALSE);

  handle_complete_first_time(pjob);

  return(NULL);
  } /* END finish_job() */



/*
 * is_jobid_in_mom()
 * Looks for jobid as a whole token in the space-delimited list jobs.
 * The list is tokenized on a private copy, so jobs itself is not modified.
 * returns: true if jobid was found; false otherwise.
 */
bool is_jobid_in_mom(

  const char *jobs,
  const char *jobid)

  {
  bool  found = false;
  char *list_copy = strdup(jobs);
  char *cursor = list_copy;

  for (char *token = threadsafe_tokenizer(&cursor, " ");
       token != NULL;
       token = threadsafe_tokenizer(&cursor, " "))
    {
    if (strcmp(jobid, token) == 0)
      {
      found = true;
      break;
      }
    }

  free(list_copy);

  return(found);
  } /* END is_jobid_in_mom() */




/*
 * sync_node_jobs_with_moms() - remove from the pbsnode (np) any job that the
 * mom did not report in its status update and that the server can no longer
 * find.
 *
 * @param np - the node whose job usage records are checked; it is temporarily
 *             unlocked (tmp_unlock_node/tmp_lock_node) around each job lookup
 * @param jobs_in_mom - the job list text reported by the mom; an empty string
 *                      makes every job on the node a candidate for removal
 */
void sync_node_jobs_with_moms(

  struct pbsnode *np,        /* I */
  const char *jobs_in_mom)   /* I */

  {
  // ids collected in the first pass and removed from the node in the second
  std::vector<int> jobsRemoveFromNode;
  // the mom reported nothing at all
  bool removealljobs = (strlen(jobs_in_mom) == 0);

  for (int i = 0; i < (int)np->nd_job_usages.size(); i++)
    {
    bool            removejob = false;
    // this one has to be a copy instead of a reference because we lose the mutex
    // below which can make the pointer invalid
    job_usage_info  jui = np->nd_job_usages[i];
    // NOTE(review): jobid is passed to strstr() unchecked - confirm that
    // get_name() cannot return NULL for an id stored on a node
    const char     *jobid = job_mapper.get_name(jui.internal_job_id);
    int             internal_job_id = jui.internal_job_id;

    if (!removealljobs)
      {
      // cheap substring test first; the exact token comparison only runs
      // when the id text appears somewhere in the report
      char *p = strstr((char *)jobs_in_mom, jobid);
      /* job is in the node but not in mom */
      if (!p)
        removejob = true;
      else if (is_jobid_in_mom(jobs_in_mom, jobid) == false)
        removejob = true;
      }
    if (removejob || removealljobs)
      {
      // the node is released while the job is looked up
      tmp_unlock_node(np, __func__, NULL, LOGLEVEL);
      job *pjob = svr_find_job(jobid, TRUE);
      tmp_lock_node(np, __func__, NULL, LOGLEVEL);
      // a job the server still knows about stays on the node; only its
      // mutex is released again
      if (pjob)
        unlock_ji_mutex(pjob, __func__, NULL, LOGLEVEL);
      else
        jobsRemoveFromNode.push_back(internal_job_id);
      }
    }

  // second pass: log and drop each job the server no longer knows about
  char log_buf[LOCAL_LOG_BUF_SIZE + 1];
  for (unsigned int i = 0; i < jobsRemoveFromNode.size(); i++)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Job %s was not reported in %s update status. Freeing job from node.",
      job_mapper.get_name(jobsRemoveFromNode[i]), np->nd_name);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    remove_job_from_node(np, jobsRemoveFromNode[i]);
    }
  } /* end of sync_node_jobs_with_moms */






/*
 * process_job_attribute_information()
 *
 * @post-cond: the job with id job_id has its attribute values updated to match
 * the values parsed from attributes
 * @param attributes - a string object with job attributes and values in the 
 * format (name1=val1[,name2=val2[...]])
 * @param jobid - a string object containing the job's id
 */
void process_job_attribute_information(
    
  std::string &job_id,
  std::string &attributes)

  {
  char *job_id_dup = strdup(job_id.c_str());
  char *attr_dup = strdup(attributes.c_str());
  // move past '(' at front
  char *attr_work = attr_dup + 1;
  job  *pjob;

  // remove the ')' at the end
  char *paren = strrchr(attr_work, ')');

  if (paren != NULL)
    *paren = '\0';

  if ((pjob = svr_find_job(job_id_dup, TRUE)) != NULL)
    {
    mutex_mgr job_mutex(pjob->ji_mutex, true);
    char *attr_val = threadsafe_tokenizer(&attr_work, ",");
    
    while (attr_val != NULL)
      {
      char *attr_name = threadsafe_tokenizer(&attr_val, "=");

      if ((attr_name != NULL) &&
          (attr_val != '\0'))
        {
        if (str_to_attr(attr_name, attr_val, pjob->ji_wattr, job_attr_def, JOB_ATR_LAST) == ATTR_NOT_FOUND)
          {
          // should be resources used if not found as attribute
          decode_resc(&(pjob->ji_wattr[JOB_ATR_resc_used]), ATTR_used, attr_name, attr_val, ATR_DFLAG_ACCESS);
          }
        }

      attr_val = threadsafe_tokenizer(&attr_work, ",");
      }

    pjob->ji_last_reported_time = time(NULL);
    }

  free(job_id_dup);
  free(attr_dup);
  } /* END process_job_attribute_information() */



/*
 * sync_node_jobs() - determine if a MOM has a stale job and possibly delete it
 *
 * This function is called every time we get a node stat from the pbs_mom.
 *
 * NOTE: changed to be processed in a thread so that processing here doesn't hinder
 * the server's ability to reply to the status
 *
 * @param vp - a sync_job_info pointer; both the struct and its input string
 *             are freed here
 * @return always NULL
 *
 * For every reported job id: any "(name=val,...)" suffix is applied to the
 * job, then the job is killed on the mom if job_should_be_killed() says so.
 * A successful kill is recorded in jobsKilled and a timed task is queued to
 * forget it again after the job sync timeout. Finally the node's own job
 * records are reconciled with the report via sync_node_jobs_with_moms().
 *
 * @see is_stat_get()
 */

void *sync_node_jobs(

  void *vp)

  {
  struct pbsnode       *np;
  sync_job_info        *sji = (sync_job_info *)vp;
  char                 *raw_input;
  char                 *node_id;
  char                 *jobstring_in;
  char                 *joblist;
  char                 *jobidstr;
  long                  job_sync_timeout = JOB_SYNC_TIMEOUT;
  char                 *jobs_in_mom = NULL;

  if (vp == NULL)
    return(NULL);

  /* only the input string is needed from the wrapper struct */
  raw_input = sji->input;

  free(sji);

  /* raw_input's format is:
   *   node name:<JOBID>(resource_name=usage_val[,resource_name2=usage_val2...])[ <JOBID>]... */
  if ((jobstring_in = strchr(raw_input, ':')) != NULL)
    {
    /* split in place: node_id is the text before ':', the job list follows it */
    node_id = raw_input;
    *jobstring_in = '\0';
    jobstring_in++;
    }
  else
    {
    /* bad input */
    free(raw_input);

    return(NULL);
    }
    
  /* the unlock_node() calls below imply the node comes back locked */
  if ((np = find_nodebyname(node_id)) == NULL)
    {
    free(raw_input);

    return(NULL);
    }

  pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);

  /* FORMAT <JOBID>[ <JOBID>]... */
  /* keep an unmodified copy for sync_node_jobs_with_moms(); jobstring_in
   * itself is handed to the tokenizer below */
  /* NOTE(review): the copy still contains any "(...)" attribute suffixes,
   * which is_jobid_in_mom() compares as part of the token - confirm intended */
  jobs_in_mom = strdup(jobstring_in);
  joblist = jobstring_in;
  jobidstr = threadsafe_tokenizer(&joblist, " ");

  /* job_sync_timeout keeps the JOB_SYNC_TIMEOUT default unless this call changes it */
  get_svr_attr_l(SRV_ATR_job_sync_timeout, &job_sync_timeout);

  /* processing stops at the first token that does not begin with a digit */
  /* NOTE(review): isdigit() is given a plain char; a cast to unsigned char
   * would avoid undefined behavior for negative values */
  while ((jobidstr != NULL) && 
         (isdigit(*jobidstr)) != FALSE)
    {
    std::string job_id(jobidstr);
    size_t      pos;
    int         internal_job_id;

    if ((pos = job_id.find("(")) != std::string::npos)
      {
      /* split "<JOBID>(attrs)" into the bare id and the attribute text */
      std::string attributes = job_id.substr(pos);
      job_id.erase(pos);

      // must unlock the node to lock the job in this sub-function
      unlock_node(np, __func__, NULL, LOGLEVEL);
      process_job_attribute_information(job_id, attributes);

      // re-lock the node
      if ((np = find_nodebyname(node_id)) == NULL)
        {
        /* the node could not be found again while it was unlocked */
        free(raw_input);

        if (jobs_in_mom)
          free(jobs_in_mom);
  
        pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);

        return(NULL);
        }
      }

    internal_job_id = job_mapper.get_id(job_id.c_str());

    if (internal_job_id == -1)
      {
      /* log a message if it's at loglevel 7 and proceed to kill the job.
      */
      if (LOGLEVEL >= 7)
        {
        char log_buf[LOCAL_LOG_BUF_SIZE];
        sprintf(log_buf, "jobid: %s not found in job_mapper", job_id.c_str());
        log_ext(-1, __func__, log_buf, LOG_WARNING);
        }
      }
    if (job_should_be_killed(job_id, internal_job_id, np))
      {
      if (kill_job_on_mom(job_id.c_str(), np) == PBSE_NONE)
        {
        /* remember the kill so it is not repeated until the timed task
         * below removes the id from the list again */
        pthread_mutex_lock(&jobsKilledMutex);
        jobsKilled.push_back(internal_job_id);
        pthread_mutex_unlock(&jobsKilledMutex);

        /* deleted by remove_job_from_already_killed_list() when the task runs */
        int *dup_id = new int(internal_job_id);
        set_task(WORK_Timed, 
                 time(NULL) + job_sync_timeout,
                 remove_job_from_already_killed_list,
                 dup_id,
                 FALSE);
        }
      }
    
    jobidstr = threadsafe_tokenizer(&joblist, " ");
    } /* END while ((jobidstr != NULL) && ...) */

  /* SUCCESS */
  free(raw_input);

  /* skipped only when the strdup() above failed */
  if (jobs_in_mom)
    {
    sync_node_jobs_with_moms(np, jobs_in_mom);
    free(jobs_in_mom);
    }

  unlock_node(np, __func__, NULL, LOGLEVEL);
  
  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, 0);

  return(NULL);
  }  /* END sync_node_jobs() */



/*
 *      setup_notification -  Sets up the  mechanism for notifying
 *                            other members of the server's node
 *                            pool that a new node was added manually
 *                            via qmgr.  Actual notification occurs some
 *                            time later through the send_cluster_addrs mechanism
 */

void setup_notification(
    
  char *pname) /* I */

  {
  struct pbsnode *pnode;
  new_node       *nnew;

  if (pname != NULL)
    {
    if ((pnode = find_nodebyname(pname)) != NULL)
      {
      /* call it offline until after all nodes get the new ipaddr */
      pnode->nd_state |= INUSE_OFFLINE;
      
      nnew = (new_node *)calloc(1, sizeof(new_node));
      
      if (nnew == NULL)
        {
        unlock_node(pnode, __func__, "nnew == NULL", LOGLEVEL);
        return;
        }
      
      CLEAR_LINK(nnew->nn_link);
      
      nnew->nn_name = strdup(pname);
      
      append_link(&svr_newnodes, &nnew->nn_link, nnew);
      
      unlock_node(pnode, __func__, "nnew != NULL", LOGLEVEL);
      }
    }

  if (addrnote_mutex == NULL)
    {
    addrnote_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));
    pthread_mutex_init(addrnote_mutex,NULL);
    }

  pthread_mutex_lock(addrnote_mutex);
  num_addrnote_tasks++;
  pthread_mutex_unlock(addrnote_mutex);

  return;
  } /* END setup_notification() */







/*
 **     reset gpu data in case mom reconnects with changed gpus.
 **     If we have real gpus, not virtual ones, then clear out gpu_status,
 **     gpus count and remove gpu subnodes.
 */

void clear_nvidia_gpus(

  struct pbsnode *np)  /* I */

  {
  pbs_attribute   temp;

  if ((np->nd_gpus_real) && (np->nd_ngpus > 0))
    {
    /* delete gpusubnodes by freeing it */
    free(np->nd_gpusn);
    np->nd_gpusn = NULL;

    /* reset # of gpus, etc */
    np->nd_ngpus = 0;
    np->nd_ngpus_free = 0;

    /* unset "gpu_status" node attribute */

    memset(&temp, 0, sizeof(temp));

    if (decode_arst(&temp, NULL, NULL, NULL, 0))
      {
      log_err(-1, __func__, "cannot initialize attribute\n");

      return;
      }

    node_gpustatus_list(&temp, np, ATR_ACTION_ALTER);
    }

  return;
  }  /* END clear_nvidia_gpus() */





/* EOF on a stream received (either stream or addr must be specified) */
/* mark node down */
/* NOTE: pass in stream = -1 if you wish the stream to be optional */
/*                                                                   */
/* The node is looked up by addr/port only; stream is just logged.  */
/* If a fresh connection to the node succeeds, the node is left as  */
/* it is. Otherwise the node, or each of its node boards when it    */
/* has any, is marked INUSE_DOWN.                                   */

void stream_eof(

  int       stream,  /* I (optional) */
  u_long    addr,  /* I (optional) */
  uint16_t  port,  /* I (optional) */
  int       ret)     /* I (index into dis_emsg for the log message) */

  {
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  int             conn;
  int             my_err = 0;

  struct pbsnode *np = NULL;

  if (LOGLEVEL >= 6)
    {
    /* addr is unsigned long, so it is printed with %lu */
    snprintf(log_buf, sizeof(log_buf), "stream: %d, addr: %lu, port %d", stream, addr, port);
    LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  if (addr != 0)
    {
    np = AVL_find(addr, port, ipaddrs);
    }

  if (np == NULL)
    {
    /* cannot locate node */

    return;
    }

  /* Before we mark this node down see if we can connect */
  lock_node(np, __func__, "parent", LOGLEVEL);
  conn = svr_connect(addr, port, &my_err, np, NULL);
  if(conn >= 0)
    {
    unlock_node(np, __func__, "parent", LOGLEVEL);
    svr_disconnect(conn);
    return;
    }

  /* NOTE(review): ret is used as an index without a range check - confirm
   * that every caller passes a valid DIS error code */
  snprintf(log_buf, sizeof(log_buf),
    "connection to %s is no longer valid, connection may have been closed remotely, remote service may be down, or message may be corrupt (%s).  setting node state to down",
    np->nd_name,
    dis_emsg[ret]);

  log_err(-1, __func__, log_buf);

  /* mark node and all subnodes as down */

  if (np->num_node_boards > 0)
    {
    int             i;
    struct pbsnode *pnode;

    for (i = 0; i < np->num_node_boards; i++)
      {
      pnode = AVL_find(i,np->nd_mom_port,np->node_boards);

      /* a board that is missing from the tree has nothing to mark down */
      if (pnode == NULL)
        continue;

      lock_node(pnode, __func__, "subs", LOGLEVEL);
      update_node_state(pnode,INUSE_DOWN);
      unlock_node(pnode, __func__, "subs", LOGLEVEL);
      }
    }
  else
    update_node_state(np, INUSE_DOWN);

  unlock_node(np, __func__, "parent", LOGLEVEL);

  return;
  }  /* END stream_eof() */


/*
 * wrapper task that check_nodes places in the thread pool's queue
 */

void *check_nodes_work(

  void *vp)

  {
  work_task        *ptask = (struct work_task *)vp;

  struct pbsnode   *np = NULL;
  long              chk_len = 300;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  time_t            time_now = time(NULL);

  node_iterator     iter;
  
  /* load min refresh interval */
  get_svr_attr_l(SRV_ATR_check_rate, &chk_len);

  if (LOGLEVEL >= 5)
    {
    sprintf(log_buf, "verifying nodes are active (min_refresh = %d seconds)", (int)chk_len);

    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
    }

  /* evaluate all nodes */
  reinitialize_node_iterator(&iter);

  while ((np = next_node(&allnodes,np,&iter)) != NULL)
    {
    if (!(np->nd_state & INUSE_NOT_READY))
      {
      if (np->nd_lastupdate < (time_now - chk_len)) 
        {
        if (LOGLEVEL >= 6)
          {
          sprintf(log_buf, "node %s not detected in %ld seconds, contacting mom",
          np->nd_name,
          (long int)(time_now - np->nd_lastupdate));
          
          log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
          }
          
        if (LOGLEVEL >= 0)
          {
          sprintf(log_buf, "node %s not detected in %ld seconds, marking node down",
            np->nd_name,
            (long int)(time_now - np->nd_lastupdate));
          
          log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, log_buf);
          }
        
        update_node_state(np, (INUSE_DOWN));    
          
        /* The node is up. Do not mark the node down, but schedule a check_nodes */
        }
      }
    } /* END for each node */

  if (iter.node_index != NULL)
    delete iter.node_index;

  if (iter.alps_index != NULL)
    delete iter.alps_index;

  if (ptask->wt_parm1 == NULL)
    {
    set_task(WORK_Timed, time_now + chk_len, check_nodes, (char *)NULL,FALSE);
    }

  /* since this is done via threading, we now free the task here */
  free(ptask->wt_mutex);
  free(ptask);

  return(NULL);
  } /* check_nodes_work() */




/*
 * check_nodes()
 *
 * Queues check_nodes_work() in the thread pool so that nodes which have not
 * checked in recently get marked down.  If the request cannot be queued the
 * task (and its mutex) is freed here and the failure is logged.
 *
 * @param ptask - (I, modified) the work task handed on to check_nodes_work()
 */

void check_nodes(

  struct work_task *ptask)  /* I (modified) */

  {
  int enqueue_rc;

  enqueue_rc = enqueue_threadpool_request(check_nodes_work, ptask, task_pool);

  /* queued successfully: the worker now owns the task */
  if (enqueue_rc == 0)
    return;

  free(ptask->wt_mutex);
  free(ptask);
  log_err(enqueue_rc, __func__, "Unable to enqueue check nodes task into the threadpool");
  }  /* END check_nodes() */



/*
 * write_node_state_work()
 *
 * Thread pool task that rewrites the node state file (path_nodestate).
 * One "<name> <state>" line is written for every node that is marked
 * offline; only the INUSE_OFFLINE and INUSE_RESERVE bits are saved.
 * All work is done while holding node_state_mutex.
 *
 * When the server is cray enabled the nodes are walked with next_node(),
 * otherwise with next_host().
 *
 * @param vp - unused
 * @return NULL always
 */

void *write_node_state_work(

  void *vp)

  {
  struct pbsnode *np = NULL;
  static char    *fmt = (char *)"%s %d\n";
  static FILE    *nstatef = NULL;
  long            cray_enabled = FALSE;
  int             savemask;

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);
  pthread_mutex_lock(node_state_mutex);

  if (LOGLEVEL >= 5)
    {
    DBPRT(("write_node_state_work: entered\n"))
    }

  /* don't store volatile states like down and unknown */

  savemask = INUSE_OFFLINE | INUSE_RESERVE;

  if (nstatef != NULL)
    {
    /* a handle survived an earlier failed run: reuse it */
    fseek(nstatef, 0L, SEEK_SET); /* rewind and clear */

    if (ftruncate(fileno(nstatef), (off_t)0) != 0)
      {
      log_err(errno, __func__, "could not truncate file");

      pthread_mutex_unlock(node_state_mutex);
      
      return(NULL);
      }
    }
  else
    {
    /* need to open for first time, temporary-move to pbsd_init */

    if ((nstatef = fopen(path_nodestate, "w+")) == NULL)
      {
      log_err(errno, __func__, "could not open file");

      pthread_mutex_unlock(node_state_mutex);
      
      return(NULL);
      }
    }

  /*
  ** The only state that carries forward is if the
  ** node has been marked offline.
  */
  if (cray_enabled == TRUE)
    {
    node_iterator   iter;
    reinitialize_node_iterator(&iter);

    while ((np = next_node(&allnodes, np, &iter)) != NULL)
      {
      if (np->nd_state & INUSE_OFFLINE)
        {
        fprintf(nstatef, fmt, np->nd_name, np->nd_state & savemask);
        }

      unlock_node(np, __func__, NULL, LOGLEVEL);
      } /* END for each node */

    /* release the iterator's indexes, as check_nodes_work() does */
    if (iter.node_index != NULL)
      delete iter.node_index;

    if (iter.alps_index != NULL)
      delete iter.alps_index;
    }
  else
    {
    all_nodes_iterator *iter = NULL;

    while ((np = next_host(&allnodes,&iter,NULL)) != NULL)
      {
      if (np->nd_state & INUSE_OFFLINE)
        {
        fprintf(nstatef, fmt, np->nd_name, np->nd_state & savemask);
        }

      unlock_node(np, __func__, NULL, LOGLEVEL);
      } /* END for each node */

    if (iter != NULL)
      delete iter;
    }

  if (fflush(nstatef) != 0)
    {
    log_err(errno, __func__, "failed saving node state to disk");
    }

  fclose(nstatef);
  nstatef = NULL;

  pthread_mutex_unlock(node_state_mutex);

  return(NULL);
  } /* END write_node_state_work() */



void *write_node_power_state_work(

  void *vp)

  {
  struct pbsnode *np;
  static char    *fmt = (char *)"%s %d\n";
  static FILE    *nstatef = NULL;
  all_nodes_iterator *iter = NULL;

  pthread_mutex_lock(node_state_mutex);

  if (LOGLEVEL >= 5)
    {
    DBPRT(("write_node_power_state_work: entered\n"))
    }

  /* don't store running state */

  if (nstatef != NULL)
    {
    fseek(nstatef, 0L, SEEK_SET); /* rewind and clear */

    if (ftruncate(fileno(nstatef), (off_t)0) != 0)
      {
      log_err(errno, __func__, "could not truncate file");

      pthread_mutex_unlock(node_state_mutex);

      return(NULL);
      }
    }
  else
    {
    /* need to open for first time, temporary-move to pbsd_init */

    if ((nstatef = fopen(path_nodepowerstate, "w+")) == NULL)
      {
      log_err(errno, __func__, "could not open file");

      pthread_mutex_unlock(node_state_mutex);

      return(NULL);
      }
    }

  /*
  ** The only state that carries forward is if the
  ** node has been marked offline.
  */

  while ((np = next_host(&allnodes,&iter,NULL)) != NULL)
    {
    if (np->nd_power_state != POWER_STATE_RUNNING)
      {
      fprintf(nstatef, fmt, np->nd_name, np->nd_power_state);
      }

    unlock_node(np, __func__, NULL, LOGLEVEL);
    } /* END for each node */

  if (iter != NULL)
    delete iter;

  if (fflush(nstatef) != 0)
    {
    log_err(errno, __func__, "failed saving node state to disk");
    }

  fclose(nstatef);
  nstatef = NULL;

  pthread_mutex_unlock(node_state_mutex);

  return(NULL);
  } /* END write_node_power_state_work() */



/*
 * write_node_state()
 *
 * Queues write_node_state_work() in the thread pool; logs an error if the
 * request cannot be queued.
 */

void write_node_state(void)

  {
  int enqueue_rc;

  enqueue_rc = enqueue_threadpool_request(write_node_state_work, NULL, task_pool);

  if (enqueue_rc != 0)
    log_err(enqueue_rc, __func__, "Unable to enqueue write_node_state_work task into the threadpool");
  }  /* END write_node_state() */

/*
 * write_node_power_state()
 *
 * Queues write_node_power_state_work() in the thread pool; logs an error if
 * the request cannot be queued.
 */

void write_node_power_state(void)

  {
  int enqueue_rc;

  enqueue_rc = enqueue_threadpool_request(write_node_power_state_work,NULL,task_pool);

  if (enqueue_rc != 0)
    log_err(enqueue_rc, __func__, "Unable to enqueue write_node_power_state_work task into the threadpool");
  }  /* END write_node_power_state() */


/*
 * write_node_note()
 *
 * Creates a new node note file (path_nodenote_new) holding one
 * "<name> <note>" line for every node that has a non-empty note, then
 * renames it over the previous file (path_nodenote).
 *
 * The rename only happens when the new file was written and closed without
 * error, so a failed write never replaces the previous file.
 *
 * @return PBSE_NONE on success, -1 if the node list is empty or the file
 *         could not be opened, written, closed or renamed
 */
int write_node_note(void)

  {
  struct pbsnode *np = NULL;
  node_iterator   iter;
  FILE           *nin;
  int             write_failed;

  if (LOGLEVEL >= 2)
    {
    DBPRT(("%s: entered\n", __func__))
    }

  if ((nin = fopen(path_nodenote_new, "w")) == NULL)
    goto err1;

  if (svr_totnodes == 0)
    {
    log_event(
      PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, "Server has empty nodes list");

    fclose(nin);

    return(-1);
    }
  
  reinitialize_node_iterator(&iter);

  /* for each node ... */
  while ((np = next_node(&allnodes, np, &iter)) != NULL)
    {
    /* write node name followed by its note string */
    if ((np->nd_note != NULL) && 
        (np->nd_note[0] != '\0'))
      {
      fprintf(nin, "%s %s\n", np->nd_name, np->nd_note);
      }
    
    unlock_node(np, __func__, NULL, LOGLEVEL);
    }

  /* release both iterator indexes, as check_nodes_work() does */
  if (iter.node_index != NULL)
    delete iter.node_index;

  if (iter.alps_index != NULL)
    delete iter.alps_index;

  fflush(nin);

  write_failed = ferror(nin);

  /* a failing close also means the data may not have reached the file */
  if (fclose(nin) != 0)
    write_failed = 1;

  if (write_failed)
    goto err1;

  if (rename(path_nodenote_new, path_nodenote) != 0)
    {
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__,
      (char *)"replacing old node note file failed");

    return(-1);
    }

  return(PBSE_NONE);

err1:
  log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__,
    (char *)"Node note file update failed");

  return(-1);
  }  /* END write_node_note() */



/*
 * free_prop - release a list of prop structures created by proplist()
 *
 * Every element's name string and the element itself are freed.
 *
 * @param prop - head of the list (may be NULL)
 */

static void free_prop(

  struct prop *prop)

  {
  while (prop != NULL)
    {
    /* remember the successor before this element goes away */
    struct prop *following = prop->next;

    free(prop->name);
    free(prop);

    prop = following;
    }
  }    /* END free_prop() */




void *node_unreserve_work(

  void *vp)

  {
  resource_t       handle = *((resource_t *)vp);

  struct  pbsnode *np;
  all_nodes_iterator *iter = NULL;

  /* clear old reserve */
  while ((np = next_host(&allnodes,&iter,NULL)) != NULL)
    {
    if (handle == RESOURCE_T_ALL)
      np->nd_np_to_be_used = 0;

    unlock_node(np, "node_unreserve_work", NULL, LOGLEVEL);
    }

  if (iter != NULL)
    delete iter;

  return(NULL);
  } /* END node_unreserve_work() */





/*
 * unreserve - unreserve nodes
 *
 * If handle is set to a existing resource_t, then release all nodes
 * associated with that handle, otherwise, (this is dangerous)
 * if handle == RESOURCE_T_ALL, release all nodes period.
 */

void node_unreserve(

  resource_t handle)

  {
  int rc = enqueue_threadpool_request(node_unreserve_work, NULL, task_pool);

  if (rc)
    {
    log_err(rc, __func__, "Unable to enqueue node_unreserve task into the threadpool");
    }
  }  /* END node_unreserve() */




/*
 * hasprop()
 *
 * Checks that every marked property in the list is present on the node.
 * Unmarked entries (mark == 0) are ignored; names are compared exactly.
 *
 * @param pnode - (I) the node whose property list (nd_first) is searched
 * @param props - (I) the list of requested properties (may be NULL)
 * @return 1 if the node has all marked properties, 0 otherwise
 */

int hasprop(

  struct pbsnode *pnode,
  struct prop    *props)

  {
  struct prop *wanted = props;

  while (wanted != NULL)
    {
    if (wanted->mark != 0)
      {
      /* walk the node's properties until the name matches or the list ends */
      struct prop *have = pnode->nd_first;

      while ((have != NULL) &&
             (strcmp(have->name, wanted->name) != 0))
        have = have->next;

      if (have == NULL)
        return(0);
      }

    wanted = wanted->next;
    }

  return(1);
  }  /* END hasprop() */





/*
 * hasppn()
 *
 * See if a node has the number of processors required.
 *
 * if free == SKIP_NONE,        compare against the total execution slots
 * if free == SKIP_NONE_REUSE,  the answer is always 0
 * otherwise                    compare against the number of free slots
 *
 * Return 1 if possible, 0 if not
 */

static int hasppn(

  struct pbsnode *pnode,     /* I */
  int             node_req,  /* I */
  int             free)      /* I */

  {
  if (free == SKIP_NONE)
    {
    if (pnode->nd_slots.get_total_execution_slots() >= node_req)
      return(1);

    return(0);
    }

  if (free == SKIP_NONE_REUSE)
    return(0);

  if (pnode->nd_slots.get_number_free() >= node_req)
    return(1);

  return(0);
  }  /* END hasppn() */




/*
 * gpu_count()
 *
 * Get a gpu count for this node
 * if freeonly is true, then count the free gpus; if false, then return the number
 * of gpus.
 *
 * A node that is offline, unknown, not ready, or whose power state is not
 * POWER_STATE_RUNNING is skipped and always counts as 0 gpus.
 *
 * @param pnode - the node whose gpus we're counting
 * @param freeonly - specifies if we want a total or just the available gpus
 * @return the total gpus if freeonly is false, else the number of free gpus
 */

int gpu_count(

  pbsnode *pnode,    /* I */
  bool     freeonly) /* I */

  {
  int  count = 0;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  if ((pnode->nd_state & INUSE_OFFLINE) ||
      (pnode->nd_state & INUSE_UNKNOWN) ||
      (pnode->nd_state & INUSE_NOT_READY)||
      (pnode->nd_power_state != POWER_STATE_RUNNING))
    {
    if (LOGLEVEL >= 7)
      {
      /* bounded: the node name is copied into a fixed size buffer */
      snprintf(log_buf, sizeof(log_buf),
        "Counted %d gpus %s on node %s that was skipped",
        count,
        (freeonly? "free" : "total"),
        pnode->nd_name);
    
      log_ext(-1, __func__, log_buf, LOG_DEBUG);
      }
    return (count);
    }

  if (freeonly == false)
    count = pnode->nd_ngpus;
  else
    count = pnode->nd_ngpus_free;

  if (LOGLEVEL >= 7)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Counted %d gpus %s on node %s",
      count,
      (freeonly? "free" : "total"),
      pnode->nd_name);

    log_ext(-1, __func__, log_buf, LOG_DEBUG);
    }

  return(count);
  }  /* END gpu_count() */





/*
 * gpu_entry_by_id()
 *
 * Get the index in nd_gpusn of the gpu with this gpuid.
 *
 * For a node with real gpus the existing entries are searched for a
 * matching id.  When nothing matches and get_empty is set, the first entry
 * without an id is returned; if there is none, a new gpu subnode is created
 * and its index returned.
 *
 * @param pnode     - (I, possibly modified) the node that owns the gpus
 * @param gpuid     - (I) the id to look for (must not be NULL)
 * @param get_empty - (I) non-zero to fall back to an empty or new entry
 * @return the index of the entry, or -1 when no entry is found or a new
 *         one could not be created
 */
int gpu_entry_by_id(

  struct pbsnode *pnode,  /* I */
  const char     *gpuid,
  int             get_empty)

  {
  if (pnode->nd_gpus_real)
    {
    int j;

    for (j = 0; j < pnode->nd_ngpus; j++)
      {
      struct gpusubn *gn = pnode->nd_gpusn + j;

      if ((gn->gpuid != NULL) && (strcmp(gpuid, gn->gpuid) == 0))
        {
        return(j);
        }
      }
    }

  /*
   * we did not find the entry.  if get_empty is set then look for an empty
   * slot.  If none is found then try to add a new entry to nd_gpusn
   */

  if (get_empty)
    {
    int j;
    int count_before;

    for (j = 0; j < pnode->nd_ngpus; j++)
      {
      struct gpusubn *gn = pnode->nd_gpusn + j;

      if (gn->gpuid == NULL)
        {
        return(j);
        }
      }

    count_before = pnode->nd_ngpus;

    create_a_gpusubnode(pnode);

    /* if the count did not grow no entry was added; returning
     * nd_ngpus - 1 would then hand back a slot that is already in use */
    if (pnode->nd_ngpus <= count_before)
      return(-1);

    return (pnode->nd_ngpus - 1);    
    }

  return (-1);
  }  /* END gpu_entry_by_id() */




/*
** Parse a number in a spec.
**
** Reads the run of decimal digits at *ptr (at most 79 of them).  On
** success *ptr is advanced past the digits that were read and *num
** receives the value; on any other result both are left untouched.
**
** Return 0 if okay, 1 if no number exists, -1 on error (the value is zero
** or does not fit in an int)
*/

static int number(

  char **ptr,
  int   *num)

  {
  char  holder[80];
  int   i = 0;
  char *str = *ptr;
  long  value;
  int   narrowed;

  /* ctype functions need an unsigned char value */
  while (isdigit((unsigned char)*str) && (unsigned int)(i + 1) < sizeof holder)
    holder[i++] = *str++;

  if (i == 0)
    {
    return(1);
    }

  holder[i] = '\0';

  /* strtol reports overflow, which atoi cannot */
  errno = 0;
  value = strtol(holder, NULL, 10);
  narrowed = (int)value;

  if ((errno == ERANGE) ||
      (value <= 0) ||
      ((long)narrowed != value))
    {
    /* zero illegal, and so is anything an int cannot hold */
    return(-1);
    }

  *ptr = str;

  *num = narrowed;

  return(0);
  }  /* END number() */




/*
** Check string to see if it starts with a legal property name and copy
** that name out.
**
** The name must start with a letter, unless the server is cray enabled
** and the string names a compute node.  Letters, digits and the
** characters '-', '.', '=' and '_' are copied into the buffer *prop points
** to, which is then NUL terminated.  A trailing "/<digits>" is skipped.
** On success *ptr is advanced past everything that was consumed.
**
** NOTE(review): the copy is not bounded - the caller's buffer must be
** large enough for the longest name in the spec; confirm at the callers.
**
** Return 0 if okay, 1 if the first character is not legal (in which case
** *ptr and the buffer are left untouched).
*/

static int property(

  char **ptr,
  char **prop)

  {
  char        *str = *ptr;
  char        *dest = *prop;
  int          i = 0;
  long         cray_enabled = FALSE;

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  /* ctype functions need an unsigned char value */
  if (!isalpha((unsigned char)*str))
    {
    if ((cray_enabled == FALSE) ||
        (is_compute_node(str) == FALSE))
      {
      /* first character of property not a letter */
      return(1);
      }
    }

  while (isalnum((unsigned char)*str) || *str == '-' || *str == '.' || *str == '=' || *str == '_')
    dest[i++] = *str++;

  dest[i] = '\0';

  /* skip over "/vp_number" */

  if (*str == '/')
    {
    do
      {
      str++;
      }
    while (isdigit((unsigned char)*str));
    }

  *ptr = str;

  return(0);
  }  /* END property() */





/*
** Create a property list from a string.
** Return 0 if all is well, 1 otherwise.
*/

int proplist(

  char        **str,
  struct prop **plist,
  int          *node_req,
  int          *gpu_req,
  int          *mic_req)

  {
  struct prop *pp;
  char         name_storage[80];
  char        *pname;
  char        *pequal;
  bool         have_gpus = false;
  char         log_buf[LOCAL_LOG_BUF_SIZE];

  *node_req = 1; /* default to 1 processor per node */

  pname  = name_storage;
  *pname = '\0';

  for (;;)
    {
    if (property(str, &pname))
      {
      return(1);
      }

    if (*pname == '\0')
      break;

    if ((pequal = strchr(pname, (int)'=')) != NULL)
      {
      /* special property */

      /* identify the special property and place its value */
      /* into node_req       */

      *pequal = '\0';

      if (strcmp(pname, "ppn") == 0)
        {
        pequal++;

        if ((number(&pequal, node_req) != 0) || (*pequal != '\0'))
          {
          return(1);
          }
        }
      else if (strcmp(pname, "mics") == 0)
        {
        pequal++;

        if ((number(&pequal, mic_req) != PBSE_NONE) ||
            (*pequal != '\0'))
          {
          return(1);
          }
        }
      else if (strcmp(pname, "gpus") == 0)
        {
        pequal++;

        if ((number(&pequal, gpu_req) != 0) || (*pequal != '\0'))
          {
          return(1);
          }

        have_gpus = true;
        gpu_err_reset = FALSE; /* default to no */

        // default value if no other gets specified
        char *deflt_gpu_mode = NULL;

        get_svr_attr_str(SRV_ATR_DefaultGpuMode, &deflt_gpu_mode);

        if (deflt_gpu_mode != NULL)
          gpu_mode_rqstd = default_gpu_mode;
        else
          {
#if defined(NVML_API_VERSION) && (NVML_API_VERSION >= 8)
          // exclusive thread mode deprecated starting in version 8
          //  so use exlusive process instead
          gpu_mode_rqstd = gpu_exclusive_process;
#else
          gpu_mode_rqstd = gpu_exclusive_thread;
#endif
          }
        }
      else
        {
        return(1); /* not recognized - error */
        }
      }
    else if (have_gpus && (!strcasecmp(pname, "exclusive_thread")))
      {
      gpu_mode_rqstd = gpu_exclusive_thread;
      }
    else if (have_gpus && (!strcasecmp(pname, "exclusive")))
      {
      gpu_mode_rqstd = gpu_exclusive_thread;
      }
    else if (have_gpus && (!strcasecmp(pname, "exclusive_process")))
      {
      gpu_mode_rqstd = gpu_exclusive_process;
      }
    else if (have_gpus && (!strcasecmp(pname, "default")))
      {
      gpu_mode_rqstd = gpu_normal;
      }
    else if (have_gpus && (!strcasecmp(pname, "shared")))
      {
      gpu_mode_rqstd = gpu_normal;
      }
    else if (have_gpus && (!strcasecmp(pname, "reseterr")))
      {
      gpu_err_reset = TRUE;
      }
    else if ((have_gpus) && 
             (!strcasecmp(pname, "prohibited")))
      {
      // Do not allow users to request prohibited mode
      throw (int)PBSE_GPU_PROHIBITED_MODE;

      // NOT REACHED
      }
    else
      {
      pp = (struct prop *)calloc(1, sizeof(struct prop));

      pp->mark = 1;
      pp->name = strdup(pname);
      pp->next = *plist;

      *plist = pp;
      }

    if ((have_gpus) && (LOGLEVEL >= 7))
      {
      sprintf(log_buf,
        "proplist: set needed gpu mode to %d",
        gpu_mode_rqstd);

       log_ext(-1, __func__, log_buf, LOG_DEBUG);
      }

    if (**str != ':')
      break;

    (*str)++;
    }  /* END for(;;) */

  return(PBSE_NONE);
  }  /* END proplist() */



/*
** Append the "global" spec to every sub-spec in "spec".
**
** Sub-specs are separated by '+'; ":<global>" is inserted before every
** '+' and once more at the end, e.g. spec "a+b" with global "x" yields
** "a:x+b:x".
**
**      RETURNS:  allocated string buffer (must be freed externally),
**                or NULL if the buffer cannot be allocated
*/

static char *mod_spec(

  char *spec,    /* I */
  char *global)  /* I */

  {
  char  *result;
  char  *out;
  char  *in;
  int    global_len = strlen(global);
  int    pieces = 1;

  /* one piece per '+' separator, plus the first one */
  for (in = spec; *in != '\0'; in++)
    if (*in == '+')
      pieces++;

  /* every piece grows by ':' and the global text; +1 for the terminator */
  result = (char *)calloc(1, pieces * (global_len + 1) + strlen(spec) + 1);

  if (result == NULL)
    return(NULL);

  out = result;

  for (in = spec; *in != '\0'; in++)
    {
    if (*in == '+')
      {
      /* close the piece that just ended */
      *out++ = ':';

      memcpy(out, global, (size_t)global_len);

      out += global_len;
      }

    *out++ = *in;
    }

  /* the last piece has no '+' after it */
  *out++ = ':';

  strcpy(out, global);

  return(result);
  }  /* END mod_spec() */



/*
 * Test a procs specification.
 *
 * Return >0 - number of procs counted in the spec if it works,
 *         0 - if it cannot be satisfied now,
 *        -1 - if it can never be satisfied.
 *
 */
int procs_available(
    
  int proc_ct)

  {
  all_nodes_iterator *iter = NULL;
  int             procs_avail = 0;
  struct pbsnode *pnode;

  if (proc_ct > svr_clnodes)
    {
    /* user requested more processors than are available on the system*/
    return(-1);
    }

  while ((pnode = next_host(&allnodes,&iter,NULL)) != NULL)
    {
    procs_avail += pnode->nd_slots.get_number_free();

    unlock_node(pnode, "procs_available", NULL, LOGLEVEL);
    }

  if (iter != NULL)
    delete iter;

  if (proc_ct > procs_avail)
    {
    return(0);
    }

  return(procs_avail);
  } /* END procs_available() */




bool node_is_spec_acceptable(

  struct pbsnode   *pnode,
  single_spec_data *spec,
  char             *ProcBMStr,
  int              *eligible_nodes,
  bool              job_is_exclusive)

  {
  struct prop    *prop = spec->prop;

  int             ppn_req = spec->ppn;
  int             gpu_req = spec->gpu;
  int             mic_req = spec->mic;
  int             gpu_free;
  int             np_free;
  int             mic_free;

#ifdef GEOMETRY_REQUESTS
  if (IS_VALID_STR(ProcBMStr))
    {
    if ((pnode->nd_state != INUSE_FREE)||(pnode->nd_power_state != POWER_STATE_RUNNING))
      return(false);

    if (node_satisfies_request(pnode, ProcBMStr) == FALSE)
      return(false);
    }
#endif

  /* NYI: check if these are necessary */
  pnode->nd_flag = okay;

  /* make sure that the node has properties */
  if (hasprop(pnode, prop) == FALSE)
    return(false);

  if ((hasppn(pnode, ppn_req, SKIP_NONE) == FALSE) ||
      (gpu_count(pnode, false) < gpu_req) ||
      (pnode->nd_nmics < mic_req))
    return(false);

  (*eligible_nodes)++;

  if (((pnode->nd_state & (INUSE_OFFLINE | INUSE_NOT_READY | INUSE_RESERVE | INUSE_JOB)) != 0)||(pnode->nd_power_state != POWER_STATE_RUNNING))
    return(false);

  gpu_free = gpu_count(pnode, true) - pnode->nd_ngpus_to_be_used;
  np_free  = pnode->nd_slots.get_number_free() - pnode->nd_np_to_be_used;
  mic_free = pnode->nd_nmics_free - pnode->nd_nmics_to_be_used;
  
  if ((ppn_req > np_free) ||
      (gpu_req > gpu_free) ||
      (mic_req > mic_free))
    return(false);

  if (job_is_exclusive)
    {
    if(pnode->nd_slots.get_number_free() != pnode->nd_slots.get_total_execution_slots())
      {
      return false;
      }
    }

  return(true);
  } /* END node_is_spec_acceptable() */



/*
 * parse_req_data()
 *
 * Parses every request string in all_reqs->req_start[] into the matching
 * entry of all_reqs->reqs[] and adds up the node counts.
 *
 * Each request starts out as 1 node, 1 ppn, no gpus, no mics and no
 * properties.  A leading number, if any, becomes the node count; the rest
 * of the request is handed to proplist().  On a cray enabled server a
 * request that names a compute node is not checked for a leading number.
 * The req_start[] pointers are advanced as the strings are consumed.
 *
 * @param all_reqs - (I/O) the requests to parse; reqs[] and total_nodes are filled in
 * @return PBSE_NONE on success, -1 if a number or a property list is malformed
 */

int parse_req_data(
    
  complete_spec_data *all_reqs)

  {
  int               i;
  int               j = 0;
  long              cray_enabled = FALSE;

  single_spec_data *req;

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);
  all_reqs->total_nodes = 0;

  for (i = 0; i < all_reqs->num_reqs; i++)
    {
    req = all_reqs->reqs + i;
    req->nodes = 1;
    req->gpu   = 0;
    /* proplist() only writes the mic count when "mics=" is present */
    req->mic   = 0;
    req->ppn   = 1;
    req->prop  = NULL;

    /* NOTE(review): when the number() call is skipped, j keeps the value
     * from the previous request */
    if ((cray_enabled == FALSE) ||
        (is_compute_node(all_reqs->req_start[i]) == FALSE))
      {
      if ((j = number(&(all_reqs->req_start[i]), &(req->nodes))) == -1)
        return(j);
      }

    if (j == 0)
      {
      /* there was a number */
      if (*(all_reqs->req_start[i]) != '\0')
        {
        if (*(all_reqs->req_start[i]) == ':')
          all_reqs->req_start[i]++;
        
        if (proplist(&(all_reqs->req_start[i]), &(req->prop), &(req->ppn), &(req->gpu), &(req->mic)))
          return(-1);
        }
      }
    else
      {
      if (*(all_reqs->req_start[i]) != '\0')
        {
        if (proplist(&(all_reqs->req_start[i]), &(req->prop), &(req->ppn), &(req->gpu), &(req->mic)))
          return(-1);
        }
      }

    all_reqs->total_nodes += req->nodes;
    }

  return(PBSE_NONE);
  } /* END parse_req_data() */




/* 
 * save_node_for_adding()
 *
 * builds the node_job_add_info struct that will be used by set_nodes
 * instead of looping over different nodes.
 *
 * The entry records the node id and the ppn / gpu / mic counts of the
 * request.  The node that is (or becomes) the first node of the job goes to
 * the front of the list with req_order 0; every other entry gets req_order
 * req_rank + 1 and is inserted so the list stays sorted by req_order.
 * The amounts are also added to the node's *_to_be_used counters.
 *
 * @param naji_list        - (I/O) the list the new entry is added to
 * @param pnode            - (I, modified) the node being saved
 * @param req              - (I) the request this node satisfies
 * @param first_node_id    - (I) id of the job's first node, or -1 if none has been chosen
 * @param is_external_node - (I) stored in the entry's is_external field
 * @param req_rank         - (I) rank of the request; stored as the entry's req_index
 *                           (a positive rank is stored as 0 for the first node)
 * @return PBSE_NONE always
 */

int save_node_for_adding(
    
  std::list<node_job_add_info> *naji_list,
  struct pbsnode               *pnode,
  single_spec_data             *req,
  int                           first_node_id,
  int                           is_external_node,
  int                           req_rank)

  {
  std::list<node_job_add_info>::iterator it;
  node_job_add_info  naji;
  bool               first = false;
  bool               added = false;
  int                pnode_id = pnode->nd_id;
    
  /* initialize */
  naji.node_id = pnode_id;
  naji.ppn_needed = req->ppn;
  naji.gpu_needed = req->gpu;
  naji.mic_needed = req->mic;
  naji.is_external = is_external_node;

  if ((first_node_id == pnode_id) ||
      (first_node_id == -1))
    {
    pnode->nd_order = 0;
    first = true;

    if (req_rank > 0)
      req_rank = 0;
    }
  else
    pnode->nd_order = 1;
  
  /* the index always comes from the rank; req->req_index is not read */
  naji.req_index = req_rank;

  if ((naji_list->size() == 0) ||
      (first == true))
    {
    // first
    if (first == true)
      naji.req_order = 0;
    else
      naji.req_order = req_rank + 1;

    naji_list->push_front(naji);
    }
  else
    {
    naji.req_order = req_rank + 1;

    // Insert into the list in rank order
    for (it = naji_list->begin(); it != naji_list->end(); it++)
      {
      if (naji.req_order < it->req_order)
        {
        naji_list->insert(it, naji);
        added = true;
        break;
        }
      }

    if (added == false)
      naji_list->push_back(naji);
    }

  /* count off the number we have reserved */
  pnode->nd_np_to_be_used    += req->ppn;
  pnode->nd_ngpus_to_be_used += req->gpu;
  pnode->nd_nmics_to_be_used += req->mic;

  return(PBSE_NONE);
  } /* END save_node_for_adding */




/*
 * set_first_node_name()
 *
 * if there is a node being requested, the spec should look like
 * node_name[:ppn=X][+]...
 * otherwise it should look like:
 * <NUM_NODES>[:ppn=X][+]...
 *
 * If a specific node is being requested first, copy just the
 * name (everything up to the first ':', '+' or '|') into first_node_name.
 * If the spec starts with a digit or equals RESOURCE_20_FIND,
 * first_node_name is set to the empty string.
 *
 * NOTE(review): the copy is not bounded - first_node_name must be large
 * enough for the name; confirm at the callers.
 */

void set_first_node_name(
    
  char *spec_param,      /* I */
  char *first_node_name) /* O */

  {
  int   i;
  int   len;

  /* isdigit() returns non-zero, not necessarily 1, for a digit; it also
   * needs an unsigned char value */
  if ((isdigit((unsigned char)spec_param[0]) != 0) ||
      (!strcmp(spec_param, RESOURCE_20_FIND)))
    {
    first_node_name[0] = '\0';
    }
  else
    {
    len = strlen(spec_param);
    
    for (i = 0; i < len; i++)
      {
      /* a ':' means you've moved on to ppn and a + means its the next req */
      if ((spec_param[i] == ':') ||
          (spec_param[i] == '+') ||
          (spec_param[i] == '|'))
        break;
      else
        first_node_name[i] = spec_param[i];
      }
    
    /* make sure you NULL terminate */
    first_node_name[i] = '\0';
    }

  } /* END set_first_node_name() */




/*
 * is_reserved_property()
 *
 * Tells whether a property name starts with one of the reserved words
 * "ppn" or "gpus" (case sensitive), or "exclusive_thread", "exclusive",
 * "exclusive_process", "default", "shared" or "reseterr" (case
 * insensitive).  Only the leading characters are compared, so a longer
 * name that begins with a reserved word is reserved as well.
 *
 * @param prop - (I) the property name to check
 * @return TRUE if the name is reserved, FALSE otherwise
 */

int is_reserved_property(

  char *prop)

  {
  /* the length belongs inside each call and the comparison with 0 outside */
  if ((strncmp(prop, "ppn", strlen("ppn")) == 0) ||
      (strncmp(prop, "gpus", strlen("gpus")) == 0) ||
      (strncasecmp(prop, "exclusive_thread", strlen("exclusive_thread")) == 0) ||
      (strncasecmp(prop, "exclusive", strlen("exclusive")) == 0) ||
      (strncasecmp(prop, "exclusive_process", strlen("exclusive_process")) == 0) ||
      (strncasecmp(prop, "default", strlen("default")) == 0) ||
      (strncasecmp(prop, "shared", strlen("shared")) == 0) ||
      (strncasecmp(prop, "reseterr", strlen("reseterr")) == 0))
    return(TRUE);
  else
    return(FALSE);
  } /* END is_reserved_property() */




/*
 * is_compute_node()
 *
 * Tells whether the start of a request string names a known node.
 *
 * The name is whatever precedes the first ':' or '+'.  If the text after
 * the first ':' is exactly "external", the alps reporter feature or the
 * alps starter feature, the answer is FALSE without any lookup.  The
 * string is cut at the separators for the lookup and restored before
 * returning.
 *
 * @param node_id - (I, temporarily modified) the request string
 * @return TRUE if find_nodebyname() knows the name, FALSE otherwise
 */

int is_compute_node(

  char *node_id)

  {
  struct pbsnode *pnode;
  int             is_known = FALSE;
  char           *colon = strchr(node_id, ':');
  char           *plus;

  if (colon != NULL)
    {
    const char *suffix = colon + 1;

    if ((strcmp(suffix, "external") == 0) ||
        (strcmp(suffix, alps_reporter_feature) == 0) ||
        (strcmp(suffix, alps_starter_feature) == 0))
      return(FALSE);

    /* cut the string at the colon for the lookup */
    *colon = '\0';
    }

  plus = strchr(node_id, '+');

  if (plus != NULL)
    *plus = '\0';

  pnode = find_nodebyname(node_id);

  if (pnode != NULL)
    {
    is_known = TRUE;
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  /* put the separators back */
  if (colon != NULL)
    *colon = ':';

  if (plus != NULL)
    *plus = '+';

  return(is_known);
  } /* END is_compute_node() */




void release_node_allocation(
    
  std::list<node_job_add_info> *naji_list)

  {
  pbsnode                                *pnode = NULL;
  std::list<node_job_add_info>::iterator  it;

  for (it = naji_list->begin(); it != naji_list->end(); it++)
    {
    if ((pnode = find_nodebyid(it->node_id)) != NULL)
      {
      pnode->nd_np_to_be_used    -= it->ppn_needed;
      pnode->nd_ngpus_to_be_used -= it->gpu_needed;
      pnode->nd_nmics_to_be_used -= it->mic_needed;
      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      }
    }
  } /* END release_node_allocation() */




/*
 * check_for_node_type()
 *
 * Looks through the properties of every request for a name that refers to
 * a node of the given type.
 *
 * Each property name (other than "cray_compute" and the alps starter
 * feature, which are skipped) is first looked up among the alps reporter's
 * subnodes.  A hit there counts only when nt is ND_TYPE_CRAY.  Names that
 * are not alps subnodes are, for the other types, looked up with
 * find_nodebyname(): ND_TYPE_EXTERNAL matches a node that is not an alps
 * login, ND_TYPE_LOGIN matches one that is.
 *
 * @param all_reqs - (I) the parsed requests whose properties are examined
 * @param nt       - (I) the node type to look for
 * @return TRUE if a node of that type is named, FALSE if not, -1 if there
 *         is no alps reporter node
 */

int check_for_node_type(

  complete_spec_data *all_reqs,
  enum node_types     nt)

  {
  single_spec_data *req;
  int               i;
  int               found_type = FALSE;
  struct pbsnode   *pnode;
  struct pbsnode   *reporter = alps_reporter;
  struct prop      *p;

  if (reporter == NULL)
    {
    /* this shouldn't be possible */
    log_err(-1, __func__, "Checking for node types with a non-cray enabled pbs_server??");
    return(-1);
    }


  for (i = 0; i < all_reqs->num_reqs; i++)
    {
    req = all_reqs->reqs + i;

    for (p = req->prop; p != NULL; p = p->next)
      {
      /* these names never identify a node */
      if ((!strcmp(p->name, "cray_compute")) ||
          (!strcmp(p->name, alps_starter_feature)))
        continue;

      /* the reporter is only locked for the duration of the lookup */
      lock_node(reporter, __func__, NULL, LOGLEVEL);
      pnode = find_node_in_allnodes(reporter->alps_subnodes, p->name);
      unlock_node(reporter, __func__, NULL, LOGLEVEL);

      if (pnode != NULL)
        {
        /* the name is an alps subnode */
        unlock_node(pnode, __func__, NULL, LOGLEVEL);

        if (nt == ND_TYPE_CRAY)
          {
          found_type = TRUE;
  
          break;
          }
        }
      else if (nt != ND_TYPE_CRAY)
        {
        int login = FALSE;

        pnode = find_nodebyname(p->name);

        if (pnode != NULL)
          {
          /* read the flag while the node is still locked */
          if (pnode->nd_is_alps_login == TRUE)
            login = TRUE;

          unlock_node(pnode, __func__, NULL, LOGLEVEL);

          if (nt == ND_TYPE_EXTERNAL)
            {
            if (login == FALSE)
              found_type = TRUE;
            }
          else if (nt == ND_TYPE_LOGIN)
            if (login == TRUE)
              found_type = TRUE;

          /* the first name that is a known node ends the search of this
           * request, whether or not it matched */
          break;
          }
        }
      }

    if (found_type == TRUE)
      break;
    }

  return(found_type);
  } /* END check_for_node_type() */




/*
 * find_job_type()
 *
 * Classifies a job by the kinds of nodes its requests name:
 *
 *   cray and external nodes      JOB_TYPE_heterogeneous
 *   cray nodes only              JOB_TYPE_cray
 *   external nodes, no cray      JOB_TYPE_normal
 *   login nodes only             JOB_TYPE_login
 *   none of the above            JOB_TYPE_cray
 *
 * @param all_reqs - (I) the parsed requests of the job
 * @return the job type
 */

enum job_types find_job_type(

  complete_spec_data *all_reqs)

  {
  if (check_for_node_type(all_reqs, ND_TYPE_CRAY) == TRUE)
    {
    if (check_for_node_type(all_reqs, ND_TYPE_EXTERNAL) == TRUE)
      return(JOB_TYPE_heterogeneous);

    return(JOB_TYPE_cray);
    }

  if (check_for_node_type(all_reqs, ND_TYPE_EXTERNAL) == TRUE)
    return(JOB_TYPE_normal);

  if (check_for_node_type(all_reqs, ND_TYPE_LOGIN) == TRUE)
    return(JOB_TYPE_login);

  /* nothing recognized: cray is the default */
  return(JOB_TYPE_cray);
  } /* END find_job_type() */




/*
 * add_login_node_if_needed()
 *
 * Makes sure the job has a login node as its first node. If the node identified
 * by first_node_id does not exist or is not flagged as an alps login, a login
 * node is requested from get_next_login_node() and, when naji_list is given,
 * saved in that list.
 *
 * @param first_node_id (I/O) - id of the job's first node; overwritten with the
 *        chosen login's id when a login is added to naji_list
 * @param login_prop (I, optional) - property list the login node must satisfy
 * @param naji_list (O, optional) - list the chosen login node is saved to
 * @return PBSE_NONE if no login was needed or one was found, -1 if no login
 *         node could be obtained
 */

int add_login_node_if_needed(

  int                          &first_node_id,
  char                         *login_prop,
  std::list<node_job_add_info> *naji_list)

  {
  struct pbsnode   *login = find_nodebyid(first_node_id);
  bool              need_to_add_login = false;
  int               rc = PBSE_NONE;
  int               dummy1 = 0;
  int               dummy2 = 0;
  int               dummy3 = 0;
  struct prop      *prop = NULL;
  single_spec_data  req;

  /* zero every field so nothing indeterminate is handed to
   * save_node_for_adding(); node_spec() creates these structs with calloc() */
  memset(&req, 0, sizeof(req));

  if (login == NULL)
    need_to_add_login = true;
  else
    {
    if (login->nd_is_alps_login == FALSE)
      need_to_add_login = true;

    unlock_node(login, __func__, NULL, LOGLEVEL);
    }

  if (need_to_add_login == false)
    return(rc);

  if (login_prop != NULL)
    {
    /* proplist() receives the string by address, so parse through a cursor
     * and keep login_prop intact for the log message below */
    char *cursor = login_prop;

    if (proplist(&cursor, &prop, &dummy1, &dummy2, &dummy3) != PBSE_NONE)
      {
      if (LOGLEVEL >= 3)
        {
        char log_buf[LOCAL_LOG_BUF_SIZE];
        snprintf(log_buf, sizeof(log_buf), "Malformed property list '%s', continuing.", login_prop);
        log_err(-1, __func__, log_buf);
        }
      }
    }

  if ((login = get_next_login_node(prop)) == NULL)
    rc = -1;
  else
    {
    if (naji_list != NULL)
      {
      /* add to list: one node, one processor, no gpus, mics or properties */
      req.nodes = 1;
      req.ppn = 1;
      req.gpu = 0;
      req.mic = 0;
      req.prop = NULL;
      save_node_for_adding(naji_list, login, &req, login->nd_id, FALSE, -1);
      first_node_id = login->nd_id;
      }

    rc = PBSE_NONE;

    unlock_node(login, __func__, NULL, LOGLEVEL);
    }

  if (prop != NULL)
    free_prop(prop);

  return(rc);
  } /* END add_login_node_if_needed() */




/*
 * node_is_external()
 *
 * @param pnode - the node to classify
 * @return TRUE if pnode is neither flagged as an alps login nor has a parent
 *         pointer set, FALSE otherwise
 */

int node_is_external(

  struct pbsnode *pnode)

  {
  /* all logins have nd_is_alps_login set to true */
  if (pnode->nd_is_alps_login != FALSE)
    return(FALSE);

  /* all cray computes have their parent pointer set to alps_reporter */
  if (pnode->parent != NULL)
    return(FALSE);

  /* neither a login nor a cray compute, so it must be an external node */
  return(TRUE);
  } /* END node_is_external() */



/*
 * record_fitting_node()
 * updates relevant structs to reflect that this node fits the specifications in req
 *
 * @pre-cond: pnode, req, and first_node_name must all be valid pointers to their types.
 * naji and ard_array are optional parameters
 * @post-cond: if naji is non-null then this node is added so it can be put in the exec_host
 * list later.
 * @post-cond: req's nodes required count is decremented
 * @post-cond: if ard_array is non-null then the node information is added there as well
 */

void record_fitting_node(

  int                           &num,
  struct pbsnode                *pnode,
  std::list<node_job_add_info>  *naji_list,       /* O (optional) */
  single_spec_data              *req,
  int                            first_node_id,
  int                            i,
  int                            num_alps_reqs,
  enum job_types                 job_type,
  complete_spec_data            *all_reqs,
  alps_req_data                **ard_array)       /* O (optional) */

  {
  // count the nodes that work
  num++;

  /* for heterogeneous jobs on the cray, record the external 
   * nodes in a separate attribute */
  if (naji_list != NULL)
    {
    if ((job_type == JOB_TYPE_heterogeneous) &&
        (node_is_external(pnode) == TRUE))
      save_node_for_adding(naji_list, pnode, req, first_node_id, TRUE, i);
    else
      save_node_for_adding(naji_list, pnode, req, first_node_id, FALSE, i);

    if ((num_alps_reqs > 0) &&
        (ard_array != NULL) &&
        (*ard_array != NULL))
      {
      if ((*ard_array)[req->req_id].node_list->length() != 0)
        (*ard_array)[req->req_id].node_list->append(",");

      (*ard_array)[req->req_id].node_list->append(pnode->nd_name);

      if (req->ppn > (*ard_array)[req->req_id].ppn)
        (*ard_array)[req->req_id].ppn = req->ppn;
      }
    }

  all_reqs->total_nodes--;
  req->nodes--;
  } /* END record_fitting_node */



/*
 * select_nodes_using_hostlist()
 *
 * This function assumes that the spec for this job run request comes in the form
 * host:ppn[+host2:ppn[...]] and simply looks up each node named in the list
 * instead of walking every node in the system, which in most cases is much
 * faster.
 *
 * @pre-cond: all_reqs, eligible_nodes, and spec must all be valid pointers.
 * @post-cond: each acceptable node is passed to record_fitting_node()
 * @return the number of nodes recorded; processing stops at the first node that
 *         is not acceptable. -1 if a request has no node name, -2 if a named
 *         node cannot be found.
 */

int select_nodes_using_hostlist(
    
  complete_spec_data            *all_reqs,        /* I */
  std::list<node_job_add_info>  *naji_list,       /* O */
  int                           *eligible_nodes,  /* O */
  const char                    *spec,            /* I */
  alps_req_data                **ard_array,       /* O (optional) */
  int                            first_node_id,   /* I */
  int                            num_alps_reqs,   /* I */
  enum job_types                 job_type,        /* I */
  char                          *ProcBMStr,       /* I (optional) */
  bool                           job_is_exclusive)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];
  int  recorded = 0;

  for (int idx = 0; idx < all_reqs->num_reqs; idx++)
    {
    single_spec_data *req = &all_reqs->reqs[idx];
    const char       *host = (req->prop != NULL) ? req->prop->name : NULL;

    /* every entry must carry a property whose name is the host */
    if (host == NULL)
      {
      snprintf(log_buf, sizeof(log_buf), "Spec '%s' doesn't have a node name for all entries", spec);
      log_err(-1, __func__, log_buf);

      return(-1);
      }

    struct pbsnode *pnode = find_nodebyname(host);

    if (pnode == NULL)
      {
      /* the specified node doesn't exist */
      snprintf(log_buf, sizeof(log_buf), "Node '%s' not found", host);
      log_err(-1, __func__, log_buf);

      return(-2);
      }

    if (node_is_spec_acceptable(pnode, req, ProcBMStr, eligible_nodes, job_is_exclusive) != false)
      {
      record_fitting_node(recorded, pnode, naji_list, req, first_node_id, req->req_id, num_alps_reqs, job_type, all_reqs, ard_array);

      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      continue;
      }

    /* the node exists but can't be used right now: stop here */
    snprintf(log_buf, sizeof(log_buf), "Requested node '%s' is not currently available", host);
    log_err(-1, __func__, log_buf);
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    break;
    }

  return(recorded);
  } /* END select_nodes_using_hostlist() */



/*
 * select_from_all_nodes()
 *
 * The traditional selecting algorithm. It walks the nodes returned by
 * next_node() and checks every still-unsatisfied request against each of them,
 * stopping once all requests are satisfied or the nodes are exhausted.
 *
 * @pre-cond: all_reqs and eligible_nodes must be valid pointers
 * @post-cond: each node that fits a request is passed to record_fitting_node()
 * @return the number of fitting nodes recorded
 */

int select_from_all_nodes(

  complete_spec_data            *all_reqs,        /* I */
  std::list<node_job_add_info>  *naji_list,       /* O (optional) */
  int                           *eligible_nodes,  /* O */
  alps_req_data                **ard_array,       /* O (optional) */
  int                            first_node_id,   /* I */
  int                            num_alps_reqs,   /* I */
  enum job_types                 job_type,        /* I */
  char                          *ProcBMStr,       /* I (optional) */
  bool                           job_is_exclusive)

  {
  node_iterator   iter;
  struct pbsnode *pnode = NULL;
  int             recorded = 0;
  
  reinitialize_node_iterator(&iter);

  /* visit every node; the previous node is handed back to next_node() */
  for (pnode = next_node(&allnodes, pnode, &iter);
       pnode != NULL;
       pnode = next_node(&allnodes, pnode, &iter))
    {
    for (int idx = 0; idx < all_reqs->num_reqs; idx++)
      {
      single_spec_data *req = &all_reqs->reqs[idx];

      /* skip requests that already have all of their nodes */
      if (req->nodes <= 0)
        continue;

      if (node_is_spec_acceptable(pnode, req, ProcBMStr, eligible_nodes,job_is_exclusive) != true)
        continue;

      record_fitting_node(recorded, pnode, naji_list, req, first_node_id, req->req_id, num_alps_reqs, job_type, all_reqs, ard_array);

      /* nothing left to look for */
      if (all_reqs->total_nodes == 0)
        break;
      }

    /* when everything is satisfied, release this node ourselves because
     * next_node() will not be called again */
    if (all_reqs->total_nodes == 0)
      {
      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      break;
      }
    } /* END for each node */

  if (iter.node_index != NULL)
    delete iter.node_index;

  return(recorded);
  } /* select_from_all_nodes() */



/*
 * process_as_node_list()
 * Decides whether or not a spec is a list of hosts (host:ppn+host2:ppn...) or
 * a generic request (4:ppn=20).
 *
 * A spec with a single entry (no '+' or '|') that starts with an alphabetic
 * character is accepted as a host list without any lookup. Otherwise the first
 * entry and, if there is one, the second entry must both name existing nodes.
 *
 * @param spec - the node spec to examine (may be NULL)
 * @param naji_list - NULL indicates we aren't running a job, so false is returned
 * @return: true if naji_list is non-null and spec looks like a node list,
 *          false otherwise.
 */

bool process_as_node_list(

  const char                   *spec,
  std::list<node_job_add_info> *naji_list)

  {
  if ((naji_list == NULL) ||
      (spec == NULL))
    return(false);

  std::string nodes(spec);
  std::string second_node;

  /* the first entry ends at the first '+' or '|', whichever comes first */
  std::size_t pos = nodes.find_first_of("+|");

  if (pos != std::string::npos)
    {
    second_node = nodes.substr(pos + 1);
    nodes.erase(pos);
    }
  /* <ctype.h> functions require a value representable as unsigned char */
  else if (isalpha((unsigned char)spec[0]))
    return(true);

  /* drop everything after the host name (e.g. ":ppn=4") */
  pos = nodes.find(":");

  if (pos != std::string::npos)
    nodes.erase(pos);

  struct pbsnode *pnode = find_nodebyname(nodes.c_str());

  if (pnode == NULL)
    return(false);

  unlock_node(pnode, __func__, NULL, LOGLEVEL);

  if (second_node.size() == 0)
    return(true);

  /* the second host name ends at the first separator of any kind */
  pos = second_node.find_first_of("+|:");

  if (pos != std::string::npos)
    second_node.erase(pos);

  pnode = find_nodebyname(second_node.c_str());

  if (pnode == NULL)
    return(false);

  unlock_node(pnode, __func__, NULL, LOGLEVEL);

  return(true);
  } /* process_as_node_list() */


/*
 * initialize_alps_req_data()
 *
 * Allocates a zeroed array of num_reqs alps_req_data entries, gives each entry
 * an empty node_list string, and stores the array in *ard_array.
 *
 * @param ard_array (O) - receives the new array, or NULL on failure
 * @param num_reqs (I) - the number of entries to allocate, must be positive
 * @return PBSE_NONE on success, PBSE_BAD_PARAMETER if ard_array is NULL or
 *         num_reqs isn't positive, ENOMEM if the array can't be allocated
 */

int initialize_alps_req_data(

  alps_req_data **ard_array,
  int             num_reqs)

  {
  if (ard_array == NULL)
    return(PBSE_BAD_PARAMETER);

  /* never leave the caller with an indeterminate pointer */
  *ard_array = NULL;

  if (num_reqs <= 0)
    return(PBSE_BAD_PARAMETER);

  alps_req_data *ard = (alps_req_data *)calloc(num_reqs, sizeof(alps_req_data));

  if (ard == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory!");
    return(ENOMEM);
    }
  
  for (int i = 0; i < num_reqs; i++)
    ard[i].node_list = new std::string();

  *ard_array = ard;

  return(PBSE_NONE);
  } // initialize_alps_req_data()



/*
 * Test a node specification.
 *
 * Return >0 - number of nodes counted in the spec if it works,
 *         0 - if it cannot be satisfied,
 *        -1 - if it can never be satisfied.
 * Okay to bail early if "early" is true.
 * VPs selected are marked "thinking"
 */

int node_spec(

  char                          *spec_param, /* I */
  int                            early,      /* I (boolean) */
  int                            exactmatch, /* I (boolean) - NOT USED */
  char                          *ProcBMStr,  /* I */
  char                          *FailNode,   /* O (optional,minsize=1024) */
  std::list<node_job_add_info>  *naji_list,  /* O (optional) */
  char                          *EMsg,       /* O (optional,minsize=1024) */
  char                          *login_prop, /* I (optional) */
  alps_req_data                **ard_array,  /* O (optional) */
  int                           *num_reqs,   /* O (optional) */
  enum job_types                &job_type,
  bool                           job_is_exclusive) /* I If true job requires must be only one on node. */

  {
  FUNCTION_TIMER
  char                 first_node_name[PBS_MAXHOSTNAME + 1];
  int                  first_node_id;
  char                 log_buf[LOCAL_LOG_BUF_SIZE];

  char                *globs;
  char                *cp;
  char                *hold;
  int                  i;
  int                  req_index;
  int                  num;
  int                  rc;
  int                  eligible_nodes = 0;
  complete_spec_data   all_reqs;
  char                *spec;
  char                *plus;
  long                 cray_enabled = FALSE;
  int                  num_alps_reqs = 0;

  FUNCTION_TIMER

  if (EMsg != NULL)
    EMsg[0] = '\0';

  if (FailNode != NULL)
    FailNode[0] = '\0';

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "entered spec=%.4000s", spec_param);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    DBPRT(("%s\n", log_buf));
    }
  
  job_type = JOB_TYPE_normal;

  set_first_node_name(spec_param, first_node_name);
  first_node_id = node_mapper.get_id(first_node_name);
  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  spec = strdup(spec_param);

  if (spec == NULL)
    {
    /* FAILURE */
    sprintf(log_buf, "cannot alloc memory");

    if (LOGLEVEL >= 1)
      {
      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }

    if (EMsg != NULL)
      {
      snprintf(EMsg, 1024, "%s", log_buf);
      }

    return(-1);
    }

  if ((globs = strchr(spec, '#')) != NULL)
    {
    *globs++ = '\0';

    globs = strdup(globs);

    while ((cp = strrchr(globs, '#')) != NULL)
      {
      *cp++ = '\0';

      hold = mod_spec(spec, cp);
      
      free(spec);
      
      spec = hold;
      }

    hold = mod_spec(spec, globs);
    
    free(spec);
    
    spec = hold;

    free(globs);
    }  /* END if ((globs = strchr(spec,'#')) != NULL) */

  all_reqs.num_reqs = 1;
  plus = spec;

  /* count number of reqs */
  while (*plus != '\0')
    {
    if ((*plus == '+') ||
        (*plus == '|'))
      all_reqs.num_reqs++;

    plus++;
    }

  /* allocate space in all_reqs */
  all_reqs.reqs      = (single_spec_data *)calloc(all_reqs.num_reqs, sizeof(single_spec_data));
  all_reqs.req_start = (char **)calloc(all_reqs.num_reqs, sizeof(char *));

  if ((all_reqs.reqs == NULL) ||
      (all_reqs.req_start == NULL))
    {
    if (all_reqs.reqs != NULL)
      free(all_reqs.reqs);
    else if (all_reqs.req_start != NULL)
      free(all_reqs.req_start);

    log_err(ENOMEM, __func__, "Cannot allocate memory!");
    free(spec);
    return(-1);
    }

  /* set up pointers for reqs */
  plus = spec;
  i = 0;
  req_index = 0;
  all_reqs.req_start[i] = spec;
  all_reqs.reqs[i].req_index = req_index;
  i++;

  while (*plus != '\0')
    {
    /* make the '+' NULL and advance past it */
    if (*plus == '|')
      num_alps_reqs++;

    if ((*plus == '|') ||
        (*plus == '+'))
      {
      if (*plus == '|')
        req_index++;

      all_reqs.reqs[i].req_id = num_alps_reqs;
      all_reqs.reqs[i].req_index = req_index;
      
      *plus = '\0';
      plus++;
      
      /* advance past "nodes=" */
      if (!strncmp(plus, "nodes=", strlen("nodes=")))
        plus += strlen("nodes=");
      
      all_reqs.req_start[i] = plus;
      i++;
      }
    else
      plus++;
    }

  /* now parse each spec into the data */
  if ((rc = parse_req_data(&all_reqs)) != PBSE_NONE)
    {
    /* FAILURE */
    for (i = 0; i < all_reqs.num_reqs; i++)
      free_prop(all_reqs.reqs[i].prop);
    
    free(all_reqs.reqs);
    free(all_reqs.req_start);

    free(spec);

    return(rc);
    }

  num = all_reqs.total_nodes;

#ifndef CRAY_MOAB_PASSTHRU
  /* If we restart pbs_server while the cray is down, pbs_server won't know about
   * the computes. Don't perform this check for this case. */
  if (alps_reporter != NULL)
    {
    alps_reporter->alps_subnodes->lock();
    }
  if ((cray_enabled != TRUE) || 
      (alps_reporter == NULL) ||
      (alps_reporter->alps_subnodes->count() != 0))
    {
    if(alps_reporter != NULL)
      {
      alps_reporter->alps_subnodes->unlock();
      }
    if (num > svr_clnodes)
      {
      /* FAILURE */

      free(spec);

      sprintf(log_buf, "job allocation request exceeds available cluster nodes, %d requested, %d available",
        num,
        svr_clnodes);

      if (LOGLEVEL >= 6)
        {
        log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
        }

      if (EMsg != NULL)
        {
        snprintf(EMsg, 1024, "%s", log_buf);
        }

      return(-1);
      }
    }
  else if(alps_reporter != NULL)
    {
    alps_reporter->alps_subnodes->unlock();
    }

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "job allocation debug: %d requested, %d svr_clnodes, %d svr_totnodes",
      num,
      svr_clnodes,
      svr_totnodes);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    DBPRT(("%s\n", log_buf));
    }

  if (cray_enabled == TRUE)
    {
    job_type = find_job_type(&all_reqs);

    if ((job_type == JOB_TYPE_cray) ||
        (job_type == JOB_TYPE_heterogeneous))
      {
      /* naji_list == NULL indicates that this is a qsub not a run command,
       * we only need to assign the login when the job is run */
      if (naji_list != NULL)
        {
        if (add_login_node_if_needed(first_node_id, login_prop, naji_list) != PBSE_NONE)
          {
          snprintf(log_buf, sizeof(log_buf), 
            "Couldn't find an acceptable login node for spec '%s' with feature request '%s'",
            spec_param,
            (login_prop != NULL) ? login_prop : "null");
          
          log_err(-1, __func__, log_buf);
          
          free(spec);
          
          return(PBSE_LOGIN_BUSY);
          }
        }
      }

    if ((num_alps_reqs > 0) &&
        (ard_array != NULL) &&
        (job_type == JOB_TYPE_cray))
      {
      *num_reqs = num_alps_reqs + 1;
      initialize_alps_req_data(ard_array, *num_reqs);
      }
    }

  if (process_as_node_list(spec_param, naji_list) == true)
    {
    select_nodes_using_hostlist(&all_reqs, naji_list, &eligible_nodes, spec, ard_array, first_node_id, num_alps_reqs, job_type, ProcBMStr,job_is_exclusive);
    }
  else
    select_from_all_nodes(&all_reqs, naji_list, &eligible_nodes, ard_array, first_node_id, num_alps_reqs, job_type, ProcBMStr,job_is_exclusive);

  for (i = 0; i < all_reqs.num_reqs; i++)
    if (all_reqs.reqs[i].prop != NULL)
      free_prop(all_reqs.reqs[i].prop);
  
  free(all_reqs.reqs);
  free(all_reqs.req_start);

  free(spec);

  /* If we restart pbs_server while the cray is down, pbs_server won't know about
   * the computes. Don't perform this check for this case. */
  if(alps_reporter != NULL)
    {
    alps_reporter->alps_subnodes->lock();
    }
  if ((cray_enabled != TRUE) || 
      (alps_reporter == NULL) ||
      (alps_reporter->alps_subnodes->count() != 0))
    {
    if(alps_reporter != NULL)
      {
      alps_reporter->alps_subnodes->unlock();
      }
#ifndef CRAY_MOAB_PASSTHRU
    if (eligible_nodes < num)
      {
      if ((SvrNodeCt == 0) || (SvrNodeCt < num))
        {
        /* sufficient eligible nodes do not exist */
        /* FAILURE */
        sprintf(log_buf,
          "job requesting nodes that will never be available - spec = %s",
          spec_param);

        log_err(-1, __func__, log_buf);
        if (naji_list != NULL)
          release_node_allocation(naji_list);

        return(-1);
        }
      }
#endif
    }
  else if(alps_reporter != NULL)
    {
    alps_reporter->alps_subnodes->unlock();
    }

  if (all_reqs.total_nodes > 0)
    {
    /* nodes not currently available */
    /* FAILURE */
    sprintf(log_buf,
      "job allocation request exceeds currently available cluster nodes, %d requested, %d available",
      num,
      num - all_reqs.total_nodes);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    if (EMsg != NULL)
      {
      snprintf(EMsg, MAXLINE, "%s", log_buf);
      }

    if (naji_list != NULL)
      release_node_allocation(naji_list);

    return(0);
    } /* END if (all_reqs.total_nodes > 0) */

  /* SUCCESS - spec is ok */
  if (LOGLEVEL >= 6)
    {
    sprintf(log_buf, "job allocation debug(3): returning %d requested", num);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    DBPRT(("%s\n", log_buf));
    }

  return(num);
  }  /* END node_spec() */




#ifdef GEOMETRY_REQUESTS
/**
 * get_bitmap
 *
 * Copies the job's "procs_bitmap" resource value into the caller's buffer.
 * At most MAX_BM - 1 characters are copied, and never more than fit in
 * ProcBMSize bytes including the terminator.
 *
 * @param pjob (I) - the job whose bitmap is be retrieved
 * @param ProcBMSize (I) - the size of the string ProcBMPtr points to
 * @param ProcBMPtr (O) - the ptr to the string where the bitmap will be stored
 * @return FAILURE if there is no specified bitmap, if either pjob or ProcBMPtr
 *         are NULL, or if ProcBMSize isn't positive
 * @return SUCCESS otherwise
 */
int get_bitmap(

  job  *pjob,        /* I */
  int   ProcBMSize,  /* I */
  char *ProcBMPtr)   /* O */

  {
  resource     *presc;
  resource_def *prd;
  const char   *bitmap;
  int           limit;

  if ((pjob == NULL) ||
      (ProcBMPtr == NULL) ||
      (ProcBMSize <= 0))
    {
    return(FAILURE);
    }

  /* read the bitmap from the resource list */
  prd = find_resc_def(svr_resc_def,"procs_bitmap",svr_resc_size);
  presc = find_resc_entry(&pjob->ji_wattr[JOB_ATR_resource],prd);
  
  if ((presc == NULL) ||
      ((presc->rs_value.at_flags & ATR_VFLAG_SET) == 0))
    {
    /* fail if there was no bitmap given */

    return(FAILURE);
    }

  bitmap = presc->rs_value.at_val.at_str;

  if ((bitmap == NULL) ||
      (bitmap[0] == '\0'))
    {
    /* fail if there was no bitmap given */

    return(FAILURE);
    }

  /* honor the caller's buffer size while keeping the MAX_BM cap */
  limit = (ProcBMSize < (int)MAX_BM) ? ProcBMSize : (int)MAX_BM;

  snprintf(ProcBMPtr,(size_t)limit,"%s",bitmap);

  return(SUCCESS);
  } /* end get_bitmap() */




/**
 * node_satisfies_request
 *
 * Checks whether every execution slot requested in the bitmap is unoccupied on
 * pnode. The last character of ProcBMStr corresponds to slot 0, the one before
 * it to slot 1, and so on; a '1' marks a requested slot.
 *
 * @param pnode (I) - the node to check for validity
 * @param ProcBMStr (I) - the bitmap of procs requested
 * @return TRUE - if the node satisfies the bitmap, FALSE otherwise
 * @return BM_ERROR if the bitmap isn't valid
 *
 * NOTE: must always be called by a thread already locking the pnode's mutex
 */
int node_satisfies_request(

  struct pbsnode *pnode,     /* I */
  char           *ProcBMStr) /* I */

  {
  if (IS_VALID_STR(ProcBMStr) == FALSE)
    return(BM_ERROR);

  /* nodes are exclusive when we're using bitmaps, so the node has to be
   * both free and powered on */
  bool node_is_idle = (pnode->nd_state == INUSE_FREE) &&
                      (pnode->nd_power_state == POWER_STATE_RUNNING);

  if (!node_is_idle)
    return(FALSE);

  /* walk the bitmap from its end while walking the slots from 0 */
  int bit  = (int)strlen(ProcBMStr) - 1;
  int slot = 0;

  while ((slot < pnode->nd_slots.get_total_execution_slots()) &&
         (bit >= 0))
    {
    bool requested = (ProcBMStr[bit] == '1');

    bit--;

    /* only requested cores have to be available */
    if ((requested) &&
        (pnode->nd_slots.is_occupied(slot) == true))
      return(FALSE);

    slot++;
    }

  /* leftover bitmap characters mean the node ran out of processors
   * before the whole string was checked */
  return((bit >= 0) ? FALSE : TRUE);
  } /* END node_satisfies_request() */




/**
 * reserve_node
 *
 * @param pnode - node to reserve
 * @param pjob - the job to be added to the node
 * @param hlistptr - a pointer to the host list 
 * @param node_info - where to save the job reservation information
 */

int reserve_node(

  struct pbsnode       *pnode,     /* I/O */
  job                  *pjob,      /* I */
  char                 *ProcBMStr, /* I */
  job_reservation_info &node_info) /* O */

  {
  if ((pnode == NULL) ||
      (pjob == NULL)  ||
      (ProcBMStr == NULL))
    {
    return(PBSE_BAD_PARAMETER);
    }

  int                   BMIndex = strlen(ProcBMStr) - 1;

  /* now reserve each node */
  for (int i = 0; i < pnode->nd_slots.get_total_execution_slots() && BMIndex >= 0; i++)
    {
    /* ignore unrequested cores */
    if (ProcBMStr[BMIndex--] != '1')
      continue;

    pnode->nd_slots.reserve_execution_slot(i, node_info.est);
    }

  if (BMIndex >= 0)
    {
    /* failure */
    return(-1);
    }
  
  job_usage_info jui(pjob->ji_internal_id);
    
  jui.est = node_info.est;
  node_info.node_id = pnode->nd_id;
  node_info.port = pnode->nd_mom_rm_port;
  pnode->nd_job_usages.push_back(jui);
    
  /* mark the node as exclusive */
  pnode->nd_state = INUSE_JOB;

  return(PBSE_NONE);
  }
#endif /* GEOMETRY_REQUESTS */

    

/*
 * add_job_to_gpu_subnode()
 *
 * Assigns pjob to the gpu gn if it has no job and isn't unavailable, and
 * updates pnode's gpu counters.
 *
 * @param pnode - the node that owns the gpu
 * @param gn - the gpu to assign
 * @param pjob - the job receiving the gpu
 * @return true if the gpu was assigned, false otherwise
 */

bool add_job_to_gpu_subnode(
    
  struct pbsnode *pnode,
  struct gpusubn *gn,
  job            *pjob)

  {
  /* already assigned to a job */
  if (gn->job_internal_id != -1)
    return(false);

  if (gn->state == gpu_unavailable)
    return(false);

  gn->job_internal_id = pjob->ji_internal_id;
  pnode->nd_ngpus_free -= 1;

  /* never let the pending count drop below zero */
  if (pnode->nd_ngpus_to_be_used > 0)
    pnode->nd_ngpus_to_be_used -= 1;

  return(true);
  } /* END add_job_to_gpu_subnode() */




/*
 * add_job_to_mic()
 *
 * Assigns pjob to the mic at position index on pnode if that mic has no job,
 * and updates pnode's mic counters.
 *
 * @param pnode - the node that owns the mic
 * @param index - the position of the mic in pnode->nd_micjobs
 * @param pjob - the job receiving the mic
 * @return PBSE_NONE if the mic was assigned, -1 if index is out of range or the
 *         mic already has a job
 */

int add_job_to_mic(

  struct pbsnode *pnode,
  int             index,
  job            *pjob)

  {
  /* nd_nmics is the number of entries in nd_micjobs */
  if ((index < 0) ||
      (index >= pnode->nd_nmics))
    return(-1);

  if (pnode->nd_micjobs[index].internal_job_id != -1)
    return(-1);

  pnode->nd_micjobs[index].internal_job_id = pjob->ji_internal_id;
  pnode->nd_nmics_free--;

  /* as with the gpus, never let the pending count drop below zero */
  if (pnode->nd_nmics_to_be_used > 0)
    pnode->nd_nmics_to_be_used--;

  return(PBSE_NONE);
  } /* END add_job_to_mic() */




/*
 * remove_job_from_nodes_mics()
 *
 * Releases every mic on pnode that is assigned to pjob.
 *
 * @param pnode - the node whose mics are examined
 * @param pjob - the job being removed
 * @return PBSE_NONE always
 */

int remove_job_from_nodes_mics(

  struct pbsnode *pnode,
  job            *pjob)

  {
  short mic = 0;

  while (mic < pnode->nd_nmics)
    {
    if (pnode->nd_micjobs[mic].internal_job_id == pjob->ji_internal_id)
      {
      /* the mic is available again */
      pnode->nd_nmics_free += 1;
      pnode->nd_micjobs[mic].internal_job_id = -1;
      }

    mic++;
    }

  return(PBSE_NONE);
  } /* END remove_job_from_nodes_mics() */



int add_gpu_to_hostlist(
  
  std::list<howl> &gpu_list,
  struct gpusubn  *gn,
  struct pbsnode  *pnode)

  {
  std::string        gpu_name(pnode->nd_name);
  static const char *suffix = "-gpu";
  bool               inserted = false;

  /* create gpu_name */
  gpu_name += suffix;

  /* initialize the pointers */
  howl h(gpu_name, pnode->nd_order, gn->index, pnode->nd_mom_rm_port);

  /* find the proper place in the list */
  for (std::list<howl>::iterator it = gpu_list.begin(); it != gpu_list.end(); it++)
    {
    if (h.order <= it->order)
      {
      inserted = true;
      gpu_list.insert(it, h);
      break;
      }
    }  /* END for (prev) */

  if (inserted == false)
    gpu_list.push_back(h);

  return(PBSE_NONE);
  } /* END add_gpu_to_hostlist() */



/*
 * place_gpus_in_hostlist()
 *
 * Walks pnode's gpus, assigning them to pjob until naji.gpu_needed reaches zero
 * or the gpus run out. Each assigned gpu is added to gpu_list and, for real
 * gpus, its state is updated to exclusive or shared according to the gpu's mode
 * and the mode requested (gpu_mode_rqstd).
 *
 * @param pnode - the node whose gpus are examined
 * @param pjob - the job receiving the gpus
 * @param naji - gpu_needed is decremented for each gpu assigned
 * @param gpu_list - receives an entry for each gpu assigned
 * @return PBSE_NONE always
 */

int place_gpus_in_hostlist(

  struct pbsnode     *pnode,
  job                *pjob,
  node_job_add_info  &naji,
  std::list<howl>    &gpu_list)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];

  for (int gpu_index = 0;
       (gpu_index < pnode->nd_ngpus) && (naji.gpu_needed > 0);
       gpu_index++)
    {
    sprintf(log_buf,
      "node: %s j %d ngpus %d need %d",
      pnode->nd_name,
      gpu_index,
      pnode->nd_ngpus,
      pnode->nd_ngpus_needed);
    
    if (LOGLEVEL >= 7)
      {
      log_ext(-1, __func__, log_buf, LOG_DEBUG);
      }
    DBPRT(("%s\n", log_buf));
    
    struct gpusubn *gn = pnode->nd_gpusn + gpu_index;

    /* gpus that can't take the job are skipped */
    if (add_job_to_gpu_subnode(pnode, gn, pjob) == false)
      continue;
    
    naji.gpu_needed -= 1;
    
    sprintf(log_buf,
      "ADDING gpu %s/%d to exec_gpus still need %d",
      pnode->nd_name,
      gpu_index,
      pnode->nd_ngpus_needed);

    if (LOGLEVEL >= 7)
      {
      log_ext(-1, __func__, log_buf, LOG_DEBUG);
      }
    DBPRT(("%s\n", log_buf));
    
    /* place the gpu in the hostlist as well */
    add_gpu_to_hostlist(gpu_list, gn, pnode);

    /* state changes only apply to real gpus */
    if (pnode->nd_gpus_real)
      {
      bool gpu_in_exclusive_mode = (gn->mode == gpu_exclusive_thread) ||
                                   (gn->mode == gpu_exclusive_process);
      bool job_wants_exclusive   = (gpu_mode_rqstd == gpu_exclusive_thread) ||
                                   (gpu_mode_rqstd == gpu_exclusive_process);

      /*
       * A gpu in exclusive/single job mode, or a gpu in default mode when
       * the job requested an exclusive mode, becomes exclusive so no other
       * job can be assigned to it
       */
      if ((gpu_in_exclusive_mode) ||
          ((gn->mode == gpu_normal) && (job_wants_exclusive)))
        {
        gn->state = gpu_exclusive;
      
        sprintf(log_buf,
          "Setting gpu %s/%d to state EXCLUSIVE for job %s",
          pnode->nd_name,
          gpu_index,
          pjob->ji_qs.ji_jobid);
      
        if (LOGLEVEL >= 7)
          {
          log_ext(-1, __func__, log_buf, LOG_DEBUG);
          }
        }

      /*
       * A gpu in shared/default job mode that is still unallocated becomes
       * shared so only other shared jobs can use it
       */
      if ((gn->mode == gpu_normal) &&
          (gpu_mode_rqstd == gpu_normal) &&
          (gn->state == gpu_unallocated))
        {
        gn->state = gpu_shared;
      
        sprintf(log_buf,
          "Setting gpu %s/%d to state SHARED for job %s",
          pnode->nd_name,
          gpu_index,
          pjob->ji_qs.ji_jobid);
      
        if (LOGLEVEL >= 7)
          {
          log_ext(-1, __func__, log_buf, LOG_DEBUG);
          }
        }
      }
    }

  return(PBSE_NONE);
  } /* END place_gpus_in_hostlist() */




int add_mic_to_list(

  std::list<howl> &mic_list,
  struct pbsnode  *pnode,
  int              index)

  {
  bool               inserted = false;
  static const char *mic_suffix = "-mic";
  /* create gpu_name */
  std::string        name(pnode->nd_name);
  name += mic_suffix;


  /* initialize the pointers */
  howl h(name, pnode->nd_order, index, pnode->nd_mom_rm_port);

  /* find the proper place in the list */
  for (std::list<howl>::iterator it = mic_list.begin(); it != mic_list.end(); it++)
    {
    if (h.order <= it->order)
      {
      inserted = true;
      mic_list.insert(it, h);

      break;
      }
    }  /* END for (prev) */

  if (inserted == false)
    mic_list.push_back(h);

  return(PBSE_NONE);
  } /* END add_mic_to_list() */



/*
 * place_mics_in_hostlist()
 *
 * Walks pnode's mics, assigning them to pjob until naji.mic_needed reaches zero
 * or the mics run out. Each assigned mic is added to mic_list.
 *
 * @param pnode - the node whose mics are examined
 * @param pjob - the job receiving the mics
 * @param naji - mic_needed is decremented for each mic assigned
 * @param mic_list - receives an entry for each mic assigned
 * @return PBSE_NONE always
 */

int place_mics_in_hostlist(

  struct pbsnode    *pnode,
  job               *pjob,
  node_job_add_info &naji,
  std::list<howl>   &mic_list)

  {
  int mic = 0;

  while ((mic < pnode->nd_nmics) &&
         (naji.mic_needed > 0))
    {
    /* mics that already have a job are passed over */
    if (add_job_to_mic(pnode, mic, pjob) == PBSE_NONE)
      {
      naji.mic_needed -= 1;
      add_mic_to_list(mic_list, pnode, mic);
      }

    mic++;
    }

  return(PBSE_NONE);
  } /* END place_mics_in_hostlist() */



#ifdef PENABLE_LINUX_CGROUPS

/*
 * save_cgroup_string_attr()
 *
 * Adds an entry for node_name to the string job attribute at position index.
 * The entry is "node_name:value", or just "node_name" when value is empty.
 * If the attribute has no string yet the entry becomes its value and the
 * attribute is flagged as set; otherwise the entry is appended after a '+'.
 *
 * @param pjob - the job whose attribute is updated
 * @param node_name - the node the value belongs to
 * @param value - the value to record for the node
 * @param index - the index of the attribute in pjob->ji_wattr
 */

void save_cgroup_string_attr(

  job         *pjob,
  const char  *node_name,
  std::string &value,
  int          index)

  {
  /* build this node's entry once */
  std::string entry(node_name);

  if (value.size() != 0)
    entry += ":" + value;

  char *existing = pjob->ji_wattr[index].at_val.at_str;

  if (existing == NULL)
    {
    /* first entry for this attribute */
    pjob->ji_wattr[index].at_val.at_str = strdup(entry.c_str());
    pjob->ji_wattr[index].at_flags |= ATR_VFLAG_SET;

    return;
    }

  /* entries for different nodes are separated by '+' */
  std::string combined(existing);

  combined += "+";
  combined += entry;

  free(existing);
  pjob->ji_wattr[index].at_val.at_str = strdup(combined.c_str());
  } // END save_cgroup_string_attr()

/*
 * save_cpus_and_memory_cpusets()
 *
 * Records the node's cgroup information on the job. The cpu and memory strings
 * are always saved; the gpu and mic strings are saved only when non-empty.
 *
 * @param pjob - the job whose attributes are updated
 * @param node_name - the node the cgroup information belongs to
 * @param cgi - the cgroup information to record
 */

void save_cpus_and_memory_cpusets(

  job         *pjob,
  const char  *node_name,
  cgroup_info &cgi)

  {
  save_cgroup_string_attr(pjob, node_name, cgi.cpu_string, JOB_ATR_cpuset_string);
  save_cgroup_string_attr(pjob, node_name, cgi.mem_string, JOB_ATR_memset_string);

  /* accelerators are only recorded when the node has some for the job */
  if (!cgi.gpu_string.empty())
    {
    save_cgroup_string_attr(pjob, node_name, cgi.gpu_string, JOB_ATR_gpus_reserved);
    }

  if (!cgi.mic_string.empty())
    {
    save_cgroup_string_attr(pjob, node_name, cgi.mic_string, JOB_ATR_mics_reserved);
    }
  } // END save_cpus_and_memory_cpusets()



/*
 * save_node_usage()
 *
 * Saves this node's usage in a file with the path path_node_usage/node_name.
 * The state is first written to path_node_usage/node_name.tmp and then renamed
 * over the final path, so the previous file stays in place until the new one
 * has been completely written. Failures are logged; nothing is returned.
 *
 * @param pnode - the node whose usage state should be saved
 */

void save_node_usage(

  pbsnode *pnode)

  {
  std::stringstream  node_state;
  std::string        path(path_node_usage);
  std::string        tmp_path;
  FILE              *f = NULL;
  char               log_buf[LOCAL_LOG_BUF_SIZE];
  int                saved_errno;
  bool               write_failed = false;

  path += "/";
  path += pnode->nd_name;
  tmp_path = path + ".tmp";

  pnode->nd_layout->displayAsJson(node_state, true);

  if ((f = fopen(tmp_path.c_str(), "w+")) == NULL)
    {
    /* capture errno before any other library call can change it */
    saved_errno = errno;

    snprintf(log_buf, sizeof(log_buf),
      "Couldn't create new file to save %s's node state",
      pnode->nd_name);
    log_err(saved_errno, __func__, log_buf);

    return;
    }

  saved_errno = 0;

  if (fputs(node_state.str().c_str(), f) == EOF)
    {
    write_failed = true;
    saved_errno = errno;
    }

  /* buffered data is flushed by fclose(), so it can fail as well */
  if (fclose(f) != 0)
    {
    if (write_failed == false)
      saved_errno = errno;

    write_failed = true;
    }

  if (write_failed == true)
    {
    /* don't replace a good file with a partial one */
    snprintf(log_buf, sizeof(log_buf),
      "Couldn't write %s's node state to %s",
      pnode->nd_name, tmp_path.c_str());
    log_err(saved_errno, __func__, log_buf);

    unlink(tmp_path.c_str());

    return;
    }

  /* rename() replaces the old file in a single step */
  if (rename(tmp_path.c_str(), path.c_str()) != 0)
    {
    saved_errno = errno;

    snprintf(log_buf, sizeof(log_buf),
      "Couldn't replace %s with new file %s in trying to save %s's state",
      path.c_str(), tmp_path.c_str(), pnode->nd_name);
    log_err(saved_errno, __func__, log_buf);
    }
  } // END save_node_usage()



/*
 * update_req_hostlist()
 *
 * Adds "<host_name>:ppn=<ppn_needed>" to the hostlist of req req_index in the job's
 * complete_req. The complete_req is created from the job's resource list and stored
 * on the job if the job doesn't have one yet.
 *
 * In cray enabled mode, only req_index 0 (the login node) is recorded.
 *
 * @param pjob - the job whose req information is updated
 * @param host_name - the name of the host being added
 * @param req_index - the index of the req this host belongs to
 * @param ppn_needed - the number of execution slots used on this host
 * @param total_ppn_in_job - not used by this function
 */

void update_req_hostlist(
    
  job        *pjob,
  const char *host_name,
  int         req_index,
  int         ppn_needed,
  int         total_ppn_in_job)

  {
  char          host_spec[MAXLINE];
  complete_req *cr = (complete_req *)pjob->ji_wattr[JOB_ATR_req_information].at_val.at_ptr;

  snprintf(host_spec, sizeof(host_spec), "%s:ppn=%d", host_name, ppn_needed);

  if (cr == NULL)
    {
    // First host for this job: build the req information from the resource list
    long legacy_vmem = FALSE;

    get_svr_attr_l(SRV_ATR_LegacyVmem, &legacy_vmem);
    cr = new complete_req(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list, 
                          (bool)legacy_vmem);

    pjob->ji_wattr[JOB_ATR_req_information].at_val.at_ptr = cr; 
    pjob->ji_wattr[JOB_ATR_req_information].at_flags |= ATR_VFLAG_SET;
    }

  long cray_enabled = FALSE;
  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  // In cray enabled mode, only create this information for the login node
  // which will have req_index 0.
  if ((cray_enabled != TRUE) ||
      (req_index == 0))
    {
    cr->update_hostlist(host_spec, req_index);
    }

  } // END update_req_hostlist()



/*
 * set_gpu_mode_if_needed()
 *
 * Applies the server's default gpu mode to every req of the job that asks for
 * gpus but has no gpu mode of its own. Nothing is done unless the job's request
 * version is greater than 1, the job has req information, and the server has a
 * default gpu mode.
 *
 * @param pjob - the job whose gpu mode we may set
 */

void set_gpu_mode_if_needed(

  job *pjob)

  {
  if (pjob->ji_wattr[JOB_ATR_request_version].at_val.at_long <= 1)
    return;

  complete_req *cr = (complete_req *)pjob->ji_wattr[JOB_ATR_req_information].at_val.at_ptr;

  if (cr == NULL)
    return;

  char *default_gpu_mode = NULL;

  get_svr_attr_str(SRV_ATR_DefaultGpuMode, &default_gpu_mode);

  if (default_gpu_mode == NULL)
    return;

  for (unsigned int req_idx = 0; req_idx < cr->get_num_reqs(); req_idx++)
    {
    req &current = cr->get_req(req_idx);

    // Only reqs that want gpus and didn't choose a mode get the default
    if (current.get_gpus() > 0)
      {
      if (current.get_gpu_mode().size() == 0)
        current.set_attribute(default_gpu_mode);
      }
    }

  } // END set_gpu_mode_if_needed()
#endif



/*
 * place_subnodes_in_hostlist()
 *
 * Reserves naji.ppn_needed execution slots on pnode for pjob and fills in
 * node_info (reserved slots, node id and mom port) so the caller can add the
 * node to the job's host list.
 *
 * When built with GEOMETRY_REQUESTS and ProcBMStr is a valid string, the node is
 * reserved through reserve_node() instead and the result of that call is returned.
 *
 * On success the node's to-be-used count is reduced by the slots placed and
 * naji.ppn_needed is set to 0. The node is marked INUSE_JOB when it has no free
 * slots left, when the job is node exclusive, or when the server's
 * JobExclusiveOnUse attribute is set and non-zero.
 *
 * When built with PENABLE_LINUX_CGROUPS the job is also placed in the node's
 * layout, the resulting cpu/memory strings are saved on the job and the node's
 * usage is written to disk.
 *
 * @param pjob - the job being placed
 * @param pnode - the node the job is being placed on
 * @param naji - what the job needs from this node; ppn_needed is zeroed on success
 * @param node_info - (O) the reservation made on this node
 * @param total_ppn_in_job - passed through to update_req_hostlist()
 * @param ProcBMStr - the geometry bitmap string; only read with GEOMETRY_REQUESTS
 * @return PBSE_NONE on success, -1 if the slots can't be reserved or the node has
 * no layout, otherwise the error from reserve_node() or place_job()
 */

int place_subnodes_in_hostlist(

  job                  *pjob,
  struct pbsnode       *pnode,
  node_job_add_info    &naji,
  job_reservation_info &node_info,
  int                   total_ppn_in_job,
  char                 *ProcBMStr)

  {
  int  rc = PBSE_NONE;
  char log_buf[LOCAL_LOG_BUF_SIZE];
#ifdef GEOMETRY_REQUESTS
  // A geometry request bypasses the normal slot reservation entirely
  if (IS_VALID_STR(ProcBMStr))
    {
    rc = reserve_node(pnode, pjob, ProcBMStr, node_info);

    if (rc == PBSE_NONE)
      {
      // nodes are used exclusively for GEOMETRY_REQUESTS
      pnode->nd_np_to_be_used = 0;
      naji.ppn_needed = 0;
      }

    return(rc);
    }

#endif

  if (pnode->nd_slots.reserve_execution_slots(naji.ppn_needed, node_info.est) == PBSE_NONE)
    {
    /* SUCCESS - the slots are reserved, now record who holds them */
    node_info.port = pnode->nd_mom_rm_port;
    
    job_usage_info jui(pjob->ji_internal_id);
    jui.est = node_info.est;
    
    node_info.node_id = pnode->nd_id;
    pnode->nd_job_usages.push_back(jui);

    // JobExclusiveOnUse only counts when it is both set and non-zero
    bool job_exclusive_on_use = false;
    if ((server.sv_attr[SRV_ATR_JobExclusiveOnUse].at_flags & ATR_VFLAG_SET) &&
        (server.sv_attr[SRV_ATR_JobExclusiveOnUse].at_val.at_long != 0))
      job_exclusive_on_use = true;
    
    // Mark the node busy if it is full or if this job gets it exclusively
    if ((pnode->nd_slots.get_number_free() <= 0) ||
        (pjob->ji_wattr[JOB_ATR_node_exclusive].at_val.at_long == TRUE) ||
        (job_exclusive_on_use))
      pnode->nd_state |= INUSE_JOB;

#ifdef PENABLE_LINUX_CGROUPS
    cgroup_info       cgi;
    long              legacy_vmem = FALSE;
    get_svr_attr_l(SRV_ATR_LegacyVmem, &legacy_vmem);

    set_gpu_mode_if_needed(pjob);

    // We shouldn't be starting a job if the layout hasn't been set up yet.
    // NOTE(review): the slots reserved above are not released on this return or on
    // the place_job() failure below - presumably the caller cleans up; confirm.
    if (pnode->nd_layout == NULL)
      return(-1);

    update_req_hostlist(pjob, pnode->nd_name, naji.req_index, naji.ppn_needed, total_ppn_in_job);

    rc = pnode->nd_layout->place_job(pjob, cgi, pnode->nd_name, (bool)legacy_vmem);
    if (rc != PBSE_NONE)
      {
      snprintf(log_buf, sizeof(log_buf),
        "Couldn't place job %s on node %s.",
        pjob->ji_qs.ji_jobid, pnode->nd_name);
      log_err(-1, __func__, log_buf);

      return(rc);
      }

    // A node exclusive job whose request version is unset or below 2 gets every
    // thread and every chip of the node instead of what place_job() selected
    if ((pjob->ji_wattr[JOB_ATR_node_exclusive].at_flags & ATR_VFLAG_SET) &&
        (pjob->ji_wattr[JOB_ATR_node_exclusive].at_val.at_long != 0) &&
        (((pjob->ji_wattr[JOB_ATR_request_version].at_flags & ATR_VFLAG_SET) == 0) ||
         (pjob->ji_wattr[JOB_ATR_request_version].at_val.at_long < 2)))
      {
      char buf[MAXLINE];

      // ranges are written as "0-<last index>", or just "0" when there is only one
      if (pnode->nd_layout->getTotalThreads() > 1)
        {
        sprintf(buf, "0-%d", pnode->nd_layout->getTotalThreads() - 1);
        cgi.cpu_string = buf;
        }
      else
        cgi.cpu_string = "0";

      if (pnode->nd_layout->getTotalChips() > 1)
        {
        sprintf(buf, "0-%d", pnode->nd_layout->getTotalChips() - 1);
        cgi.mem_string = buf;
        }
      else
        cgi.mem_string = "0";
      }

    save_cpus_and_memory_cpusets(pjob, pnode->nd_name, cgi);
    save_node_usage(pnode);
#endif

    // What was marked as "to be used" is now really in use
    pnode->nd_np_to_be_used -= naji.ppn_needed;
    naji.ppn_needed = 0;
    }
  else
    {
    snprintf(log_buf, sizeof(log_buf),
      "Node %s doesn't have enough execution slots remaining for job %s.",
      pnode->nd_name, pjob->ji_qs.ji_jobid);
    log_err(-1, __func__, log_buf);

    rc = -1;
    }

  return(rc);
  } /* END place_subnodes_in_hostlist() */



/*
 * translate_howl_to_string()
 *
 * Takes a list of howl entries and translates it to a string that will
 * become a job pbs_attribute (exec_hosts, exec_gpus, exec_ports).
 *
 * The host string has the form name/index[+name/index...] and the optional port
 * string has the form port[+port...]. An empty list produces empty strings.
 *
 * NOTE: hlist is only read. The strings handed back through str_ptr and
 * portstr_ptr are heap allocated and must be released with free() by the caller.
 *
 * @param hlist - (I) the entries to translate
 * @param EMsg - (O, optional) receives a message if memory can't be allocated
 * @param NCount - (O) set to the number of entries translated; untouched on failure
 * @param str_ptr - (O) receives the host string
 * @param portstr_ptr - (O, optional) receives the port string when port is TRUE
 * @param port - (I) TRUE if the port string should be built as well
 * @return PBSE_NONE on success, PBSE_RESCUNAV if memory can't be allocated
 */

int translate_howl_to_string(

  std::list<howl>  &hlist,
  char             *EMsg,
  int              *NCount,
  char            **str_ptr,
  char            **portstr_ptr,
  int               port)

  {
  // Without somewhere to store the port string there is no point in building it
  const bool   want_ports = ((port == TRUE) && (portstr_ptr != NULL));
  std::string  hosts;
  std::string  ports;
  char         numbuf[32];   /* big enough for a separator plus any int */
  int          count = 0;
  char        *str;
  char        *portlist = NULL;

  /* build name/index+name/index+... - std::string grows as needed, so the size
   * of an entry never has to be guessed up front */
  for (std::list<howl>::iterator it = hlist.begin(); it != hlist.end(); it++)
    {
    if (count > 0)
      hosts += "+";

    hosts += it->hostname.c_str();
    snprintf(numbuf, sizeof(numbuf), "/%d", it->index);
    hosts += numbuf;

    if (want_ports == true)
      {
      /* port list will have a string of sister port addresses */
      if (count > 0)
        ports += "+";

      snprintf(numbuf, sizeof(numbuf), "%d", it->port);
      ports += numbuf;
      }

    count++;
    }

  if ((str = strdup(hosts.c_str())) == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory!");

    if (EMsg != NULL)
      sprintf(EMsg,"no nodes can be allocated to job");
    
    return(PBSE_RESCUNAV);
    }

  if (want_ports == true)
    {
    if ((portlist = strdup(ports.c_str())) == NULL)
      {
      log_err(ENOMEM, __func__, "Cannot allocate memory!");
      
      if (EMsg != NULL)
        sprintf(EMsg,"no nodes can be allocated to job");

      free(str);
      
      return(PBSE_RESCUNAV);
      }

    *portstr_ptr = portlist;
    }

  /* nothing can fail anymore, hand the results back */
  *NCount = count;
  *str_ptr = str;

  return(PBSE_NONE);
  } /* END translate_howl_to_string() */



/*
 * populate_range_string_from_slot_tracker()
 *
 * Walks the occupied indices of est and writes them to range_str as a comma
 * separated list in which runs of consecutive indices are collapsed to
 * first-last.
 *
 * @post-cond: range_str is populated with a string representing the range of
 * occupied execution slots in the execution slot tracker.
 * For example, if slots 0, 2, 3, 4, and 5 are occupied the string should be 0,2-5
 *
 * @param est - the slot tracker to read
 * @param range_str - (O) cleared and then filled with the range string
 */

void populate_range_string_from_slot_tracker(

  const execution_slot_tracker &est,
  std::string                  &range_str)

  {
  char  numbuf[10];
  int   est_iterator = -1;
  int   est_index;
  int   last_index = -1;
  bool  have_entry = false;    // something has been written to range_str
  bool  run_extended = false;  // the entry written last is being followed by consecutive indices

  range_str.clear();

  while ((est_index = est.get_next_occupied_index(est_iterator)) != -1)
    {
    if (have_entry == false)
      {
      // the very first index is written without a separator
      snprintf(numbuf, sizeof(numbuf), "%d", est_index);
      range_str += numbuf;
      have_entry = true;
      }
    else if (last_index == est_index - 1)
      {
      // still consecutive: write nothing until the run ends
      run_extended = true;
      }
    else
      {
      // a gap: finish the run we were in, if any, then start a new entry
      if (run_extended == true)
        {
        snprintf(numbuf, sizeof(numbuf), "-%d", last_index);
        range_str += numbuf;
        run_extended = false;
        }

      snprintf(numbuf, sizeof(numbuf), ",%d", est_index);
      range_str += numbuf;
      }

    last_index = est_index;
    }

  // close a run that reached the last occupied index
  if (run_extended == true)
    {
    snprintf(numbuf, sizeof(numbuf), "-%d", last_index);
    range_str += numbuf;
    }
  } /* END populate_range_string_from_slot_tracker() */



/*
 * translate_job_reservation_info_to_string()
 *
 * takes a vector of job_reservation_info objects and turns them into an exec_host list.
 * This list is in the format host1/range_str1[+host2/range_str2[+...]]
 * range_str is in the format of digit1[-digit2][,digit3[-digit4]...
 *
 * @post-cond: exec_host_output is populated with the contents of this list
 * @post-cond: exec_port_output is optionally populated with a corresponding port for each
 * entry in the host list
 */

int translate_job_reservation_info_to_string(
    
  std::vector<job_reservation_info>  &host_info, 
  int                                *NCount, 
  std::string                        &exec_host_output,
  std::stringstream                  *exec_port_output)

  {
  bool              first = true;

  for (int hi_index = 0; hi_index < (int)host_info.size(); hi_index++)
    {
    const job_reservation_info &jri = host_info[hi_index];
    std::string           range_str;
    
    (*NCount)++;
     
    if (first == false)
      {
      exec_host_output += "+";

      if (exec_port_output != NULL)
        *exec_port_output << "+";
      }

    populate_range_string_from_slot_tracker(jri.est, range_str);
    
    const char *node_id = node_mapper.get_name(jri.node_id);

    exec_host_output += node_id;
    exec_host_output += "/";
    exec_host_output  += range_str;

    if (exec_port_output != NULL)
      *exec_port_output << jri.port;

    first = false;
    } /* END for each job_reservation_info * in the vector */

  return(PBSE_NONE);
  } /* END translate_job_reservation_info_to_string() */



/*
 * record_external_node()
 *
 * Adds pnode's name to the job's external_nodes attribute. Names are joined
 * with '+'.
 *
 * external nodes refers only to nodes outside of the cray
 * for jobs that also have cray compute nodes
 *
 * If memory for the new string can't be allocated the attribute is left exactly
 * as it was.
 *
 * @param pjob - the job whose external_nodes attribute is updated
 * @param pnode - the external node to record
 * @return PBSE_NONE on success, PBSE_RESCUNAV if memory can't be allocated
 */

int record_external_node(

  job            *pjob,
  struct pbsnode *pnode)

  {
  char *current = pjob->ji_wattr[JOB_ATR_external_nodes].at_val.at_str;
  char *updated;

  if (current == NULL)
    {
    updated = strdup(pnode->nd_name);
    }
  else
    {
    /* room for both names, the '+' between them and the terminator */
    size_t len = strlen(current) + strlen(pnode->nd_name) + 2;

    if ((updated = (char *)calloc(1, len)) != NULL)
      snprintf(updated, len, "%s+%s", current, pnode->nd_name);
    }

  if (updated == NULL)
    {
    /* don't touch the attribute: the old list is still valid */
    log_err(ENOMEM, __func__, "Cannot allocate memory!");

    return(PBSE_RESCUNAV);
    }

  if (current == NULL)
    pjob->ji_wattr[JOB_ATR_external_nodes].at_flags |= ATR_VFLAG_SET;
  else
    free(current);

  pjob->ji_wattr[JOB_ATR_external_nodes].at_val.at_str = updated;

  return(PBSE_NONE);
  } /* END record_external_node() */



/*
 * build_hostlist_nodes_req()
 *
 * builds the hostlist based on the nodes=... part of the request
 *
 * Walks naji_list and, for every entry whose node can be found, places the job's
 * execution slots, gpus and mics on that node. Once any entry fails, the remaining
 * entries are only walked to take back the amounts that were marked as "to be
 * used" on their nodes.
 *
 * @param pjob - (M) the job being placed
 * @param EMsg - (O, optional) if not NULL, a failure is logged together with
 * whatever this buffer already contains
 * @param spec - (I) the node spec, used for logging only
 * @param newstate - (I) not used by this function
 * @param host_info - (O) gets one reservation per node the job was placed on
 * @param gpu_list - (O) the gpus given to the job
 * @param mic_list - (O) the mics given to the job
 * @param naji_list - (I) what the job needs from each node
 * @param ProcBMStr - (I) passed through to place_subnodes_in_hostlist()
 * @return PBSE_NONE if every entry was fully satisfied, PBSE_RESCUNAV otherwise
 */

int build_hostlist_nodes_req(

  job                                *pjob,      /* M */
  char                               *EMsg,      /* O */
  char                               *spec,      /* I */
  short                               newstate,  /* I */
  std::vector<job_reservation_info>  &host_info, /* O */
  std::list<howl>                    &gpu_list,  /* O */
  std::list<howl>                    &mic_list,  /* O */
  std::list<node_job_add_info>       *naji_list, /* I */
  char                               *ProcBMStr) /* I */

  {
  struct pbsnode                         *pnode = NULL;
  std::list<node_job_add_info>::iterator  it;
  char                                    log_buf[LOCAL_LOG_BUF_SIZE];
  bool                                    failure = false;
  int                                     total_ppn_in_job = 0;


  for (it = naji_list->begin(); it != naji_list->end(); it++)
    {
    total_ppn_in_job += it->ppn_needed;
    }

  for (it = naji_list->begin(); it != naji_list->end(); it++)
    {
    pnode = find_nodebyid(it->node_id);

    if (pnode != NULL)
      {
      if (failure == true)
        {
        /* just remove the marked request from the node */
        pnode->nd_np_to_be_used    -= it->ppn_needed;
        pnode->nd_ngpus_to_be_used -= it->gpu_needed;
        pnode->nd_nmics_to_be_used -= it->mic_needed;
        }
      else
        {
        int rc = PBSE_NONE;

        job_reservation_info host_single;
        rc = place_subnodes_in_hostlist(pjob, pnode, *it, host_single, total_ppn_in_job, ProcBMStr);
        if (rc == PBSE_NONE)
          {
          host_info.push_back(host_single);
          place_gpus_in_hostlist(pnode, pjob, *it, gpu_list);
          place_mics_in_hostlist(pnode, pjob, *it, mic_list);
        
          if (it->is_external == TRUE)
            {
            record_external_node(pjob, pnode);
            }
          }
        else
          failure = true;

        /* NOTE: continue through the loop if failure is true just to clean up amounts needed */

        /* the placement helpers count these down as they place things, so anything
         * still above zero is something this node could not provide */
        if ((it->gpu_needed > 0) || 
            (it->ppn_needed > 0) ||
            (it->mic_needed > 0))
          {
          failure = true;
       
          /* remove any remaining things marked on the node */
          pnode->nd_np_to_be_used    -= it->ppn_needed;
          pnode->nd_ngpus_to_be_used -= it->gpu_needed;
          pnode->nd_nmics_to_be_used -= it->mic_needed;

#ifdef PENABLE_LINUX_CGROUPS
          remove_job_from_node(pnode, pjob->ji_internal_id);
#endif
          }
#ifdef PENABLE_LINUX_CGROUPS
        else if (failure == true)
          remove_job_from_node(pnode, pjob->ji_internal_id);
#endif
        }

      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      }

    } /* END processing reserved nodes */
   
  if (failure == true)
    {
    /* did not satisfy the request */
    if (EMsg != NULL)
      {
      /* spec and EMsg come from the caller, so bound the write to log_buf */
      snprintf(log_buf, sizeof(log_buf),
        "Could not locate requested resources '%.4000s' (node_spec failed) %s",
        spec,
        EMsg);
      
      log_record(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }

    return(PBSE_RESCUNAV);
    }

  return(PBSE_NONE);
  } /* END build_hostlist_nodes_req() */




/*
 * build_hostlist_procs_req()
 *
 * Satisfies the procs=... part of a request by reserving free execution slots on
 * the nodes in allnodes, in iteration order, until enough slots have been
 * reserved. Every node that contributes slots gets an entry in host_info.
 *
 * If the job has no -l nodes request, one proc is considered to be covered by the
 * node TORQUE allocates by default, so only procs - 1 are reserved here (but
 * never fewer than 1).
 *
 * @param pjob - (M) the job being placed
 * @param procs - (I) the number of procs requested; nothing is done if <= 0
 * @param newstate - (I) not used by this function
 * @param host_info - (O) gets one reservation per node that contributed slots
 * @return PBSE_NONE
 */

int build_hostlist_procs_req(

  job                               *pjob,     /* M */
  int                                procs,    /* I */
  short                              newstate, /* I */
  std::vector<job_reservation_info> &host_info) /* O */

  {
  int             procs_needed;
  node_iterator   iter;
  struct pbsnode *pnode = NULL;

  /* did we have a request for procs? Do those now */
  if (procs <= 0)
    return(PBSE_NONE);

  /* check to see if a -l nodes request was made */
  if (pjob->ji_have_nodes_request)
    {
    procs_needed = procs;
    }
  else if (procs > 1)
    {
    /* the qsub request used -l procs only. No -l nodes=x
       was given in the qsub request.
       TORQUE allocates 1 node by default if a -l nodes specification
       is not given.
    */
    procs_needed = procs - 1;
    }
  else
    procs_needed = 1;

  reinitialize_node_iterator(&iter);

  while ((pnode = next_node(&allnodes,pnode,&iter)) != NULL)
    {
    /* Once the request is satisfied nothing more is reserved. The walk is left
     * to run to its end instead of breaking out, because breaking out would need
     * to release the current node the way locate_resource_request_20_nodes()
     * does. */
    if (procs_needed <= 0)
      continue;

    /* never take more from a node than the job still needs */
    int slots_to_reserve = pnode->nd_slots.get_number_free();

    if (slots_to_reserve > procs_needed)
      slots_to_reserve = procs_needed;

    if (slots_to_reserve > 0)
      {
      job_reservation_info node_info;
      if (pnode->nd_slots.reserve_execution_slots(slots_to_reserve, node_info.est) == PBSE_NONE)
        {
        procs_needed -= slots_to_reserve;

        /* the vector stores a copy, so the entry has to be complete before it
         * is pushed; the node id is what the exec_host string is built from */
        node_info.port = pnode->nd_mom_rm_port;
        node_info.node_id = pnode->nd_id;

        host_info.push_back(node_info);
        }
      }
    } /* END for each node */

  return(PBSE_NONE);
  } /* END build_hostlist_procs_req() */



/*
 * free_alps_req_data_array()
 *
 * Releases an array of alps_req_data together with the node list owned by each
 * element. A NULL array is accepted and ignored, whatever num_reqs says.
 *
 * @param ard_array - the array to release; may be NULL
 * @param num_reqs - the number of elements in ard_array
 */

void free_alps_req_data_array(
    
  alps_req_data *ard_array,
  int            num_reqs)

  {
  // Callers pass whatever they have on every exit path, including "nothing"
  if (ard_array == NULL)
    return;

  for (int i = 0; i < num_reqs; i++)
    delete ard_array[i].node_list;

  free(ard_array);
  } /* END free_alps_req_data_array() */



/*
 * add_multi_reqs_to_job() -- for Cray
 *
 * Looks at ard_array and adds the multi-req information to the job.
 *
 * The attribute value has the form nodelist*ppn[|nodelist*ppn...], with one
 * section per req. Any previous value of the attribute is replaced.
 *
 * @param pjob - the job we should add the multi-req information to.
 * @param num_reqs - the number of multi-reqs
 * @param ard_array - the array holding the alps req data
 * @return PBSE_NONE if there is no data or it is added correctly
 */

int add_multi_reqs_to_job(
    
  job           *pjob,
  int            num_reqs,
  alps_req_data *ard_array)

  {
  std::string     attr_str;
  char            buf[MAXLINE];

  /* no array or no elements both mean there is nothing to read */
  if ((ard_array == NULL) ||
      (num_reqs <= 0))
    return(PBSE_NONE);

  for (int i = 0; i < num_reqs; i++)
    {
    if (i != 0)
      attr_str += '|';

    /* a req without a node list still gets its *ppn section */
    if (ard_array[i].node_list != NULL)
      attr_str += ard_array[i].node_list->c_str();

    snprintf(buf, sizeof(buf), "*%d", ard_array[i].ppn);
    attr_str += buf;
    }

  /* free(NULL) is a no-op, so the old value doesn't need to be checked */
  free(pjob->ji_wattr[JOB_ATR_multi_req_alps].at_val.at_str);

  pjob->ji_wattr[JOB_ATR_multi_req_alps].at_val.at_str = strdup(attr_str.c_str());
  pjob->ji_wattr[JOB_ATR_multi_req_alps].at_flags |= ATR_VFLAG_SET;

  return(PBSE_NONE);
  } /* END add_multi_reqs_to_job() */



#ifdef PENABLE_LINUX_CGROUPS
/*
 * add_entry_to_naji_list()
 *
 * Adds a new entry to the naji list equivalent to the tasks_placed tasks from req r
 *
 * @param naji_list - the list of node_add_job_info
 * @param r - the req we're using to get the placement information
 * @param hostname - the hostname where these tasks were placed
 * @param tasks_placed - the number of tasks placed from req r
 */

void add_entry_to_naji_list(

  std::list<node_job_add_info> &naji_list,
  req                          &r,
  struct pbsnode               *pnode,
  int                           tasks_placed,
  int                           req_index)

  {
  node_job_add_info naji;

  naji.node_id = pnode->nd_id;
  naji.ppn_needed = r.get_execution_slots() * tasks_placed;
  naji.gpu_needed = r.get_gpus() * tasks_placed;
  naji.mic_needed = r.getMics() * tasks_placed;
  naji.is_external = false;

  if (naji_list.size() == 0)
    naji.req_order = 0;
  else
    naji.req_order = req_index + 1;

  naji.req_index = req_index;
  
  pnode->nd_np_to_be_used    += naji.ppn_needed;
  pnode->nd_ngpus_to_be_used += naji.gpu_needed;
  pnode->nd_nmics_to_be_used += naji.mic_needed;

  naji_list.push_back(naji);
  } // END add_entry_to_naji_list()



/*
 * locate_resource_request_20_nodes()
 *
 * Makes -L work when no hostlist is given at run-time, intended for testing purposes
 *
 * For each req of the job, walks allnodes and marks as many of the req's tasks as
 * each node's layout says it can take, until every task of the req has a node.
 * A node is used for at most one req.
 *
 * NOTE: currently this doesn't work for Cray systems
 *
 * @param pjob - the job we're running
 * @param naji_list - the information to save the nodes later
 * @param ard_array - not used here; we need this if we ever want this to work for Cray
 * @param num_reqs - same thing as ard_array
 * @param job_type - same thing as ard_array
 * @return 0 on a busy failure, 1 for success. This mimics the way node_spec() returns because 
 * we fall into the same error processing code as node_spec()
 */

int locate_resource_request_20_nodes(

  job                           *pjob,
  std::list<node_job_add_info>  &naji_list,
  alps_req_data                **ard_array,
  int                           &num_reqs,
  enum job_types                &job_type)

  {
  complete_req *cr = (complete_req *)pjob->ji_wattr[JOB_ATR_req_information].at_val.at_ptr;
  std::set<int> used_nodes;
  int           rc = 1; // positive numbers mean success - this needs to return like node_spec()

  // It shouldn't be possible for this to be NULL but don't segfault
  if (cr == NULL)
    return(0);

  for (int req_idx = 0; req_idx < cr->req_count(); req_idx++)
    {
    req            &r = cr->get_req(req_idx);
    int             tasks_left = r.getTaskCount();
    node_iterator   iter;
    struct pbsnode *pnode = NULL;

    reinitialize_node_iterator(&iter);

    /* iterate over all nodes */
    while ((pnode = next_node(&allnodes, pnode, &iter)) != NULL)
      {
      // Do not re-use a node for a second req
      if (used_nodes.count(pnode->nd_id) != 0)
        continue;

      // A node without a layout can't take any tasks
      if (pnode->nd_layout == NULL)
        continue;

      int placeable = pnode->nd_layout->how_many_tasks_can_be_placed(r);

      if (placeable == 0)
        continue;

      if (placeable > tasks_left)
        placeable = tasks_left;

      // For now, only mark the number we want to add. The actual placement comes later
      add_entry_to_naji_list(naji_list, r, pnode, placeable, req_idx);

      used_nodes.insert(pnode->nd_id);

      tasks_left -= placeable;

      if (tasks_left == 0)
        {
        // we're leaving the iteration early, so release this node ourselves
        unlock_node(pnode, __func__, NULL, 10);
        break;
        }
      } // END for each node

    // The iterator's index is released whether or not this req was satisfied
    if (iter.node_index != NULL)
      delete iter.node_index;

    if (tasks_left > 0)
      {
      // We failed to place all the tasks, return a busy error
      rc = 0;
      break;
      }
    }

  return(rc);
  } // END locate_resource_request_20_nodes()
#endif



/*
 * set_nodes() - Call node_spec() to allocate nodes then set them inuse.
 * Build list of allocated nodes to pass back in rtnlist.
 *
 * When built with PENABLE_LINUX_CGROUPS and spec is RESOURCE_20_FIND, the nodes
 * are located with locate_resource_request_20_nodes() instead of node_spec().
 *
 * On success *rtnlist and *rtnportlist point to heap strings holding the exec
 * host list and the matching port list, the job's exec_gpus / exec_mics
 * attributes are set if gpus / mics were allocated, and the job is flagged as
 * having nodes.
 *
 * @param pjob - (I) the job to allocate nodes for
 * @param spec - (I) the node spec
 * @param procs - (I) the number of procs requested with -l procs
 * @param rtnlist - (O) receives the exec host list
 * @param rtnportlist - (O) receives the port list
 * @param FailHost - (O, optional, minsize=1024) cleared here, passed to node_spec()
 * @param EMsg - (O, optional, minsize=1024) receives a failure message
 *
 *      Return: PBS error code - PBSE_NONE on success, PBSE_RESCUNAV if the
 *      resources can't be located or allocated, PBSE_LOGIN_BUSY as reported by
 *      the node search, PBSE_UNKNODE for a corrupt request
 */

int set_nodes(

  job   *pjob,        /* I */
  char  *spec,        /* I */
  int    procs,       /* I */
  char **rtnlist,     /* O */
  char **rtnportlist, /* O */
  char  *FailHost,    /* O (optional,minsize=1024) */
  char  *EMsg)        /* O (optional,minsize=1024) */

  {
  FUNCTION_TIMER
  std::vector<job_reservation_info> host_info;
  std::string                       exec_hosts;
  std::stringstream                 exec_ports;
  std::list<node_job_add_info>      naji_list;
  std::list<howl>                   gpu_list;
  std::list<howl>                   mic_list;

  int                i;
  int                rc;
  int                NCount = 0;
  /* translate_howl_to_string() overwrites the count it is given, so gpus and
   * mics are counted separately to keep NCount as the number of hosts */
  int                device_count = 0;
  short              newstate;

  char              *login_prop = NULL;
  char              *gpu_str = NULL;
  char              *mic_str = NULL;
  char               ProcBMStr[MAX_BM];
  char               log_buf[LOCAL_LOG_BUF_SIZE];
  alps_req_data     *ard_array = NULL;
  int                num_reqs = 0;
  long               cray_enabled = FALSE; 
  /* locate_resource_request_20_nodes() never assigns the job type, so it needs
   * a defined value before it is read below */
  enum job_types     job_type = JOB_TYPE_normal;

  int gpu_flags = 0;

  if (FailHost != NULL)
    FailHost[0] = '\0';

  if (EMsg != NULL)
    EMsg[0] = '\0';

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, sizeof(log_buf),
      "allocating nodes for job %s with node expression '%.4000s'",
      pjob->ji_qs.ji_jobid,
      spec);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  ProcBMStr[0] = '\0';
#ifdef GEOMETRY_REQUESTS
  get_bitmap(pjob,sizeof(ProcBMStr),ProcBMStr);
#endif /* GEOMETRY_REQUESTS */

  if (pjob->ji_wattr[JOB_ATR_login_prop].at_flags & ATR_VFLAG_SET)
    login_prop = pjob->ji_wattr[JOB_ATR_login_prop].at_val.at_str;

  bool job_is_exclusive = false;
  if (pjob->ji_wattr[JOB_ATR_node_exclusive].at_flags & ATR_VFLAG_SET)
    job_is_exclusive = (pjob->ji_wattr[JOB_ATR_node_exclusive].at_val.at_long != 0);

#ifdef PENABLE_LINUX_CGROUPS
  if (!strcmp(spec, RESOURCE_20_FIND))
    {
    i = locate_resource_request_20_nodes(pjob, naji_list, &ard_array, num_reqs, job_type);
    }
  else
    {
#endif
    i = node_spec(spec, 1, 1, ProcBMStr, FailHost, &naji_list, EMsg, login_prop, &ard_array,
                  &num_reqs, job_type, job_is_exclusive);
#ifdef PENABLE_LINUX_CGROUPS
    }
#endif

  /* allocate nodes */
  if (i == 0)
    {
    /* no resources located, request failed */
    if (EMsg != NULL)
      {
      snprintf(log_buf, sizeof(log_buf),
        "could not locate requested resources '%.4000s' (node_spec failed) %s",
        spec,
        EMsg);

      log_record(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }

    free_alps_req_data_array(ard_array, num_reqs);

    return(PBSE_RESCUNAV);
    }
  else if (i == PBSE_LOGIN_BUSY)
    {
    free_alps_req_data_array(ard_array, num_reqs);
    return(i);
    }
  else if (i < 0)
    {
    /* request failed, corrupt request */
    log_err(PBSE_UNKNODE, __func__, "request failed, corrupt request");
    free_alps_req_data_array(ard_array, num_reqs);
    return(PBSE_UNKNODE);
    }
 
  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);
  if (cray_enabled == TRUE)
    {
    // JOB_TYPE_normal means no component from the Cray will be used
    if ((job_type != JOB_TYPE_normal) && 
        (naji_list.size() > 1))
      {
      /* the first node in the list is recorded as the job's login node */
      pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str = strdup(node_mapper.get_name(naji_list.begin()->node_id));
      pjob->ji_wattr[JOB_ATR_login_node_id].at_flags = ATR_VFLAG_SET;
      pjob->ji_wattr[JOB_ATR_login_node_key].at_val.at_long = naji_list.begin()->node_id;
      pjob->ji_wattr[JOB_ATR_login_node_key].at_flags = ATR_VFLAG_SET;
      }
    }

  newstate = INUSE_JOB;

  if ((rc = build_hostlist_nodes_req(pjob,
                                     EMsg,
                                     spec,
                                     newstate,
                                     host_info,
                                     gpu_list,
                                     mic_list,
                                     &naji_list,
                                     ProcBMStr)) != PBSE_NONE)
    {
    free_nodes(pjob, spec);
    free_alps_req_data_array(ard_array, num_reqs);
    return(rc);
    }

  if ((rc = build_hostlist_procs_req(pjob, procs, newstate, host_info)) != PBSE_NONE)
    {
    free_nodes(pjob, spec);
    free_alps_req_data_array(ard_array, num_reqs);
    return(rc);
    }

  if (host_info.empty() == true)
    {
    if (LOGLEVEL >= 1)
      {
      snprintf(log_buf, sizeof(log_buf), "no nodes can be allocated to job %s",
        pjob->ji_qs.ji_jobid);

      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }

    if (EMsg != NULL)
      sprintf(EMsg, "no nodes can be allocated to job");
    
    free_alps_req_data_array(ard_array, num_reqs);
    free_nodes(pjob, spec);

    return(PBSE_RESCUNAV);
    }  /* END if (host_info.size() == 0) */

  pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HasNodes;  /* indicate has nodes */

  /* build list of allocated nodes, gpus, and ports */
  rc = translate_job_reservation_info_to_string(host_info, &NCount, exec_hosts, &exec_ports);
  if (rc != PBSE_NONE)
    {
    free_alps_req_data_array(ard_array, num_reqs);
    free_nodes(pjob, spec);
    return(rc);
    }

  *rtnlist = strdup(exec_hosts.c_str());
  *rtnportlist = strdup(exec_ports.str().c_str());

  // JOB_TYPE_normal means no component from the Cray will be used
  if ((cray_enabled == TRUE) &&
      (job_type != JOB_TYPE_normal))
    {
    char *plus = strchr(*rtnlist, '+');

    /* only do this if there's more than one host in the host list */
    if (plus != NULL)
      {
      /* drop the first host from the list that is handed back */
      char *to_free = *rtnlist;

      *plus = '\0';
      *rtnlist = strdup(plus + 1);
      free(to_free);
      }
    }

  if (mic_list.size() != 0)
    {
    if ((rc = translate_howl_to_string(mic_list, EMsg, &device_count, &mic_str, NULL, FALSE)) != PBSE_NONE)
      {
      /* release the alps data here as well, like every other failure path */
      free_alps_req_data_array(ard_array, num_reqs);
      free_nodes(pjob, spec);
      return(rc);
      }

    job_attr_def[JOB_ATR_exec_mics].at_free(
      &pjob->ji_wattr[JOB_ATR_exec_mics]);

    job_attr_def[JOB_ATR_exec_mics].at_decode(
      &pjob->ji_wattr[JOB_ATR_exec_mics],
      NULL,
      NULL,
      mic_str,
      0);

    free(mic_str);
    }

  if (gpu_list.size() != 0)
    {
    if ((rc = translate_howl_to_string(gpu_list, EMsg, &device_count, &gpu_str, NULL, FALSE)) != PBSE_NONE)
      {
      free_alps_req_data_array(ard_array, num_reqs);
      free_nodes(pjob, spec);
      return(rc);
      }

    job_attr_def[JOB_ATR_exec_gpus].at_free(
      &pjob->ji_wattr[JOB_ATR_exec_gpus]);
    
    job_attr_def[JOB_ATR_exec_gpus].at_decode(
      &pjob->ji_wattr[JOB_ATR_exec_gpus],
      NULL,
      NULL,
      gpu_str,
      0);  /* O */
    
    free(gpu_str);

    /* gpu_flags is the requested mode, plus 1000 if an error reset was requested */
    if (gpu_mode_rqstd != -1)
      gpu_flags = gpu_mode_rqstd;
    if (gpu_err_reset)
      gpu_flags += 1000;

    if (gpu_flags >= 0)
      {
      pjob->ji_wattr[JOB_ATR_gpu_flags].at_val.at_long = gpu_flags;
      pjob->ji_wattr[JOB_ATR_gpu_flags].at_flags = ATR_VFLAG_SET | ATR_VFLAG_MODIFY;
      
      if (LOGLEVEL >= 7)
        {
        snprintf(log_buf, sizeof(log_buf), "setting gpu_flags for job %s to %d %ld",
          pjob->ji_qs.ji_jobid,
          gpu_flags,
          pjob->ji_wattr[JOB_ATR_gpu_flags].at_val.at_long);

        log_ext(-1, __func__, log_buf, LOG_DEBUG);
        }
      }
    }

  if (LOGLEVEL >= 3)
    {
    snprintf(log_buf, sizeof(log_buf), "job %s allocated %d nodes (nodelist=%.4000s)",
      pjob->ji_qs.ji_jobid,
      NCount,
      *rtnlist);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  add_multi_reqs_to_job(pjob, num_reqs, ard_array);
  free_alps_req_data_array(ard_array, num_reqs);

  /* SUCCESS */

  return(PBSE_NONE);
  }  /* END set_nodes() */




/*
 * procs_requested - count the number of requested processors in a node spec
 *
 * Any global modifiers (the text following a '#') are applied to a private
 * copy of the spec through mod_spec() before the '+'-separated sub-specs are
 * counted; the caller's string is never written to.
 *
 * @param spec - the node spec to examine
 * @return the number of processors requested on success
 *         -1 if spec is NULL or cannot be parsed
 *         -2 if memory cannot be allocated
 */ 
int procs_requested(
    
  char *spec)

  {
  char        *str;
  char        *globs;
  char        *cp;
  char        *hold;
  int          num_nodes = 0;
  int          num_procs = 1;
  int          total_procs = 0;
  int          num_gpus = 0;
  int          num_mics = 0;
  int          i;
  struct prop *prop = NULL;
  char        *tmp_spec;
  char         log_buf[LOCAL_LOG_BUF_SIZE];

  if (spec == NULL)
    {
    /* nothing to parse */
    return(-1);
    }

  tmp_spec = strdup(spec);  

  /* Check to see if we have a global modifier */
  if ((tmp_spec != NULL) &&
      ((globs = strchr(tmp_spec, '#')) != NULL))
    {
    /* cut the modifiers off the working copy and keep our own copy of them */
    *globs++ = '\0';

    globs = strdup(globs);

    if (globs == NULL)
      {
      free(tmp_spec);
      tmp_spec = NULL;
      }

    /* Apply the modifiers one at a time, last one first. Each pass must 
     * operate on the working copy (not the caller's spec, which still
     * carries the '#' text) so that earlier modifications are kept. */
    while ((tmp_spec != NULL) &&
           ((cp = strrchr(globs, '#')) != NULL))
      {
      *cp++ = '\0';

      hold = mod_spec(tmp_spec, cp);
      
      free(tmp_spec);
      
      tmp_spec = hold;
      }

    /* the first modifier is what remains at the head of globs */
    if (tmp_spec != NULL)
      {
      hold = mod_spec(tmp_spec, globs);
    
      free(tmp_spec);
    
      tmp_spec = hold;
      }

    free(globs);
    }  /* END if ((globs = strchr(tmp_spec,'#')) != NULL) */

  if (tmp_spec == NULL)
    {
    /* FAILURE - one of the copies above could not be allocated */

    snprintf(log_buf, sizeof(log_buf), "cannot alloc memory");

    if (LOGLEVEL >= 1)
      {
      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }

    return(-2);
    }

  str = tmp_spec;

  do
    {
    /* every sub-spec starts from one processor per node; without this a
     * bare node count would inherit the ppn of the previous sub-spec */
    num_procs = 1;

    if ((i = number(&str, &num_nodes)) == -1)
      {
      free(tmp_spec);
      /* Bad string syntax. Fail */
      return(-1);
      }

    if (i == 0)
      {
      /* number exists */
      if (*str == ':')
        {
        /* there are properties */

        str++;

        if (proplist(&str, &prop, &num_procs, &num_gpus, &num_mics))
          {
          free(tmp_spec);
          if (prop != NULL)
            free_prop(prop);
          return(-1);
          }
        else if (prop != NULL)
          {
          /* only the counts matter here, the properties are not needed */
          free_prop(prop);
          prop = NULL;
          }
        }
      }
    else
      {
      /* no number */
      num_nodes = 1;
      if (proplist(&str, &prop, &num_procs, &num_gpus, &num_mics))
        {
        /* must be a prop list with no number in front */
        free(tmp_spec);
        if (prop != NULL)
          free_prop(prop);

        return(-1);
        }
      else if (prop != NULL)
        {
        free_prop(prop);
        prop = NULL;
        }
      }
    total_procs += num_procs * num_nodes;
    } while(*str++ == '+');
  
  free(tmp_spec);
  
  return(total_procs);
  } /* END procs_requested() */





/*
 * node_avail_complex - evaluate a complex node spec through node_spec()
 *
 * *navail receives whatever node_spec() returned.
 * *nalloc, *nresvd and *ndown are always reported as zero.
 *
 * @return the value returned by node_spec() (-1 on failure)
 */

int node_avail_complex(

  char *spec,   /* I - node spec */
  int  *navail, /* O - number available */
  int  *nalloc, /* O - number allocated */
  int  *nresvd, /* O - number reserved  */
  int  *ndown)  /* O - number down      */

  {
  enum job_types job_type;  /* only handed to node_spec(), never read here */
  int            available;

  available = node_spec(spec, 1, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, job_type,false);

  /* this path does not break the count down any further */
  *ndown  = 0;
  *nresvd = 0;
  *nalloc = 0;
  *navail = available;

  return(available);
  }  /* END node_avail_complex() */





/*
 * node_avail - report if nodes requested are available
 *
 * A spec without a '+' and without a leading number is treated as a simple
 * spec: every cluster node matching its properties is counted into exactly
 * one of the four output counters. Anything else is handed to
 * node_avail_complex().
 *
 * Return 0 when no error in request and
 *  *navail is set to number available
 *  *nalloc is set to number allocated
 *  *nresvd is set to number reserved
 *  *ndown  is set to number down/offline
 *      !=0 error number when error in request
 *          (RM_ERR_NOPARAM for a NULL spec, RM_ERR_BADPARAM for a bad one)
 */

int node_avail(

  char *spec,  /* I  - node spec */
  int  *navail, /* O - number available */
  int *nalloc, /* O - number allocated */
  int *nresvd, /* O - number reserved  */
  int *ndown)  /* O - number down      */

  {
  int             j;
  int             holdnum;

  struct pbsnode *pn;
  char           *pc;

  struct prop    *prop = NULL;
  int             xavail;
  int             xalloc;
  int             xresvd;
  int             xdown;
  int             node_req = 1;
  int             gpu_req = 0;
  int             mic_req = 0;

  node_iterator   iter;

  if (spec == NULL)
    {
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, "no spec");

    return(RM_ERR_NOPARAM);
    }

  pc = spec;

  if ((strchr(spec, (int)'+') == NULL) && (number(&pc, &holdnum) == 1))
    {
    /* A simple node spec - reply with numbers of available, */
    /* allocated, reserved, and down nodes that match the */
    /* spec, null or simple number means all  */

    xavail = 0;
    xalloc = 0;
    xresvd = 0;
    xdown  = 0;

    /* find number of a specific type of node */

    if (*pc)
      {
      if (proplist(&pc, &prop, &node_req, &gpu_req, &mic_req))
        {
        /* release whatever was parsed before the failure so the */
        /* partial property list is not leaked */
        free_prop(prop);

        return(RM_ERR_BADPARAM);
        }
      }

    reinitialize_node_iterator(&iter);
    pn = NULL;

    while ((pn = next_node(&allnodes, pn, &iter)) != NULL)
      {
      if ((pn->nd_ntype == NTYPE_CLUSTER) && hasprop(pn, prop))
        {
        if (pn->nd_state & (INUSE_OFFLINE | INUSE_NOT_READY))
          ++xdown;
        else if (hasppn(pn, node_req, SKIP_ANYINUSE))
          ++xavail;
        else if (hasppn(pn, node_req, SKIP_NONE))
          {
          /* node has enough processors, are they busy or reserved? */
          j = pn->nd_slots.get_total_execution_slots() - pn->nd_np_to_be_used;
          
          if (j >= node_req)
            ++xresvd;
          else
            ++xalloc;
          }
        }
      } /* END for each node */

    free_prop(prop);

    *navail = xavail;

    *nalloc = xalloc;

    *nresvd = xresvd;

    *ndown  = xdown;

    return(0);
    }
  else if (number(&pc, &holdnum) == -1)
    {
    /* invalid spec */

    return(RM_ERR_BADPARAM);
    }

  /* not a simple spec - determine if supplied complex */
  /* node spec can be satisfied from avail nodes */
  /* navail set to >0 if can be satisfied now  */
  /*    0 if not now but possible  */
  /*   -1 if never possible   */

  node_avail_complex(spec, navail, nalloc, nresvd, ndown);

  return(0);
  }  /* END node_avail() */




/*
 * node_reserve - Reserve nodes
 *
 * Runs the spec through node_spec() and, when that does not report failure,
 * sets INUSE_RESERVE on every node whose nd_flag is 'thinking' and whose
 * nd_np_to_be_used equals its total number of execution slots.
 *
 * Returns: >0 - reservation succeeded, number of nodes reserved
 *    0 - None or partial reservation
 *   -1 - requested reservation impossible
 */

int node_reserve(

  char       *nspec, /* In     - a node specification */
  resource_t  tag)   /* In/Out - tag for resource if reserved (not referenced in this function) */

  {
  struct pbsnode               *pnode = NULL;
  int                           ret_val;
  node_iterator                 iter;
  char                          log_buf[LOCAL_LOG_BUF_SIZE];
  std::list<node_job_add_info>  naji_list;
  enum job_types                job_type;

  DBPRT(("%s: entered\n", __func__))

  if ((nspec == NULL) || (*nspec == '\0'))
    {
    log_event(PBSEVENT_ADMIN, PBS_EVENTCLASS_SERVER, __func__, "no spec");

    return(-1);
    }

  ret_val = node_spec(nspec, 0, 0, NULL, NULL, &naji_list, NULL, NULL, NULL, NULL, job_type,false);

  if (ret_val < 0)
    {
    /* could never satisfy the reservation */

    snprintf(log_buf, sizeof(log_buf), "can never reserve %s", nspec);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);

    return(ret_val);
    }

  /*
  ** Zero or more of the needed Nodes are available to be
  ** reserved.
  */
  reinitialize_node_iterator(&iter);

  while ((pnode = next_node(&allnodes,pnode,&iter)) != NULL)
    {
    /* only nodes flagged 'thinking' with all of their slots spoken for */
    if ((pnode->nd_flag == thinking) &&
        (pnode->nd_np_to_be_used == pnode->nd_slots.get_total_execution_slots()))
      {
      pnode->nd_state |= INUSE_RESERVE;
      }
    } /* END for each node */

  return(ret_val);
  }  /* END node_reserve() */




char *get_next_exec_host(

  char **current)

  {
  FUNCTION_TIMER
  char *name_ptr = *current;
  char *plus;
  char *slash;
  char *colon;
  
  if (name_ptr != NULL)
    {
    if ((plus = strchr(name_ptr, '+')) != NULL)
      {
      *current = plus + 1;
      *plus = '\0';
      }
    else
      *current = NULL;

    if ((slash = strchr(name_ptr, '/')) != NULL)
      *slash = '\0';

    if ((colon = strchr(name_ptr, ':')) != NULL)
      *colon = '\0';
    }

  return(name_ptr);
  } /* END get_next_exec_host() */



/*
 * remove_job_from_nodes_gpus()
 *
 * Releases every gpu on pnode that is recorded as belonging to pjob. Nothing
 * is touched unless the job's exec_gpus attribute is set and non-NULL.
 *
 * @param pnode - the node whose job is being removed
 * @param pjob - the job that should be removed from the gpus
 * @return PBSE_NONE
 */

int remove_job_from_nodes_gpus(

  pbsnode *pnode,
  job     *pjob)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];
  int  idx;
 
  /* a job that never had gpus recorded has nothing to give back */
  if ((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) == 0)
    return(PBSE_NONE);

  if (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str == NULL)
    return(PBSE_NONE);

  /* reset gpu nodes */
  for (idx = 0; idx < pnode->nd_ngpus; idx++)
    {
    struct gpusubn *gpu = pnode->nd_gpusn + idx;

    if (gpu->job_internal_id != pjob->ji_internal_id)
      continue;

    gpu->job_internal_id = -1;
    pnode->nd_ngpus_free++;

    /* the state is only reset for these three modes */
    switch (gpu->mode)
      {
      case gpu_exclusive_thread:
      case gpu_exclusive_process:
      case gpu_normal:

        gpu->state = gpu_unallocated;

        break;

      default:

        break;
      }

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "freeing node %s gpu %d for job %s",
        pnode->nd_name,
        idx,
        pjob->ji_qs.ji_jobid);

      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }
    }

  return(PBSE_NONE);
  } /* END remove_job_from_nodes_gpus() */



/*
 * remove_job_from_node()
 *
 * Drops every usage record the node holds for the job, handing the job's
 * execution slots back to the node and clearing INUSE_JOB for each record
 * removed. When built with PENABLE_LINUX_CGROUPS and the node has a layout,
 * the job's allocation is freed from the layout and the node usage is saved.
 *
 * @param pnode - the node removing the job
 * @param internal_job_id - the internal job id from the job to be removed
 * @return PBSE_NONE
 */

int remove_job_from_node(

  pbsnode *pnode,
  int      internal_job_id)

  {
  FUNCTION_TIMER
  char log_buf[LOCAL_LOG_BUF_SIZE];
  int  pos = 0;

  while (pos < (int)pnode->nd_job_usages.size())
    {
    const job_usage_info &usage = pnode->nd_job_usages[pos];

    if (usage.internal_job_id != internal_job_id)
      {
      pos++;

      continue;
      }

    /* give the slots back before the record (and 'usage') goes away */
    pnode->nd_slots.unreserve_execution_slots(usage.est);
    pnode->nd_job_usages.erase(pnode->nd_job_usages.begin() + pos);

    if (LOGLEVEL >= 6)
      {
      sprintf(log_buf, "increased execution slot free count to %d of %d\n",
        pnode->nd_slots.get_number_free(),
        pnode->nd_slots.get_total_execution_slots());
      
      log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
      }

    pnode->nd_state &= ~INUSE_JOB;

    /* pos is left alone: the following record now sits at this position */
    }

#ifdef PENABLE_LINUX_CGROUPS
  if (pnode->nd_layout != NULL)
    {
    pnode->nd_layout->free_job_allocation(job_mapper.get_name(internal_job_id));
    save_node_usage(pnode);
    }
#endif
  
  return(PBSE_NONE);
  } /* END remove_job_from_node() */




/*
 * free_nodes - free nodes allocated to a job
 *
 * The hosts are taken from the job's exec_host string when it is set and
 * non-NULL; otherwise spec is used if it is non-NULL. The job is removed from
 * each named node (slots, gpus and mics), then from its login node, and
 * JOB_SVFLG_HasNodes is cleared. With PENABLE_LINUX_CGROUPS the job's req
 * allocations are cleared for the rerun/queued/transit-out substates and the
 * cpuset and memset strings are released.
 */

void free_nodes(

  job        *pjob, // M
  const char *spec) // I

  {
  FUNCTION_TIMER
  struct pbsnode *pnode;

  char            log_buf[LOCAL_LOG_BUF_SIZE];
  char           *host_list = NULL;   /* writable copy - the walk below edits it */
  char           *cursor;
  char           *host;
  char           *login_name;

  if (LOGLEVEL >= 3)
    {
    sprintf(log_buf, "freeing nodes for job %s", pjob->ji_qs.ji_jobid);

    log_record(PBSEVENT_SCHED, PBS_EVENTCLASS_REQUEST, __func__, log_buf);
    }

  if (((pjob->ji_wattr[JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL))
    {
    host_list = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    }

  // Attempt to use spec if the exec host list isn't populated and spec is
  if ((host_list == NULL) &&
      (spec != NULL))
    {
    host_list = strdup(spec);
    }

  cursor = host_list;

  for (host = get_next_exec_host(&cursor);
       host != NULL;
       host = get_next_exec_host(&cursor))
    {
    pnode = find_nodebyname(host);

    if (pnode == NULL)
      continue;

    remove_job_from_node(pnode, pjob->ji_internal_id);
    remove_job_from_nodes_gpus(pnode, pjob);
    remove_job_from_nodes_mics(pnode, pjob);
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  free(host_list);

  /* the login node only needs the job's slot usage removed */
  login_name = pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str;

  if (login_name != NULL)
    {
    pnode = find_nodebyname(login_name);

    if (pnode != NULL)
      {
      remove_job_from_node(pnode, pjob->ji_internal_id);
      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      }
    }

  pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_HasNodes;

#ifdef PENABLE_LINUX_CGROUPS
  complete_req *cr = (complete_req *)pjob->ji_wattr[JOB_ATR_req_information].at_val.at_ptr;

  if (cr != NULL)
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN)  ||
        (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN1) ||
        (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN2) ||
        (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RERUN3) ||
        (pjob->ji_qs.ji_substate == JOB_SUBSTATE_QUEUED) ||
        (pjob->ji_qs.ji_substate == JOB_SUBSTATE_TRNOUT))
      {
      cr->clear_allocations();
      }
    }

  /* the cpuset string first, then the memset string, are released alike */
  const int string_attrs[2] = { JOB_ATR_cpuset_string, JOB_ATR_memset_string };

  for (int k = 0; k < 2; k++)
    {
    const int which = string_attrs[k];

    if (pjob->ji_wattr[which].at_val.at_str == NULL)
      continue;

    free(pjob->ji_wattr[which].at_val.at_str);
    pjob->ji_wattr[which].at_val.at_str = NULL;
    pjob->ji_wattr[which].at_flags &= ~ATR_VFLAG_SET;
    }
#endif

  return;
  }  /* END free_nodes() */



/*
 * get_compute_node - get the alps sub-node for a purely numeric node name
 *
 * A name is only accepted when it is non-empty and made up entirely of
 * decimal digits. For such a name create_alps_subnode() is called on
 * alps_reporter while alps_reporter is locked.
 *
 * @param node_name - the candidate compute node name
 * @return whatever create_alps_subnode() returned, or NULL if node_name is
 *         NULL, empty, or contains a non-digit character
 */

struct pbsnode *get_compute_node(

  const char *node_name)

  {
  struct pbsnode *ar = alps_reporter;
  struct pbsnode *compute_node = NULL;
  const char     *pc;

  /* an empty name would pass the digit check below vacuously */
  if ((node_name == NULL) ||
      (*node_name == '\0'))
    return(NULL);

  for (pc = node_name; *pc != '\0'; pc++)
    {
    /* isdigit() is only defined for unsigned char values (and EOF) */
    if (!isdigit((unsigned char)*pc))
      {
      /* found a non-numeric character - not a compute node */
      return(NULL);
      }
    }

  lock_node(ar, __func__, NULL, LOGLEVEL);
  compute_node = create_alps_subnode(ar, node_name);
  unlock_node(ar, __func__, NULL, LOGLEVEL);

  return(compute_node);
  } /* END get_compute_node() */




/*
 * set_one_old - set a named node as allocated to a job
 */

int set_one_old(

  char *name,
  job  *pjob)

  {
  int             first;
  int             last;
  int             rc = PBSE_NONE;

  struct pbsnode *pnode;
  char           *pc;
  char           *dash;
  long            cray_enabled = FALSE;

  if ((pc = strchr(name, (int)'/')))
    {
    first = strtol(pc + 1, &dash, 10);

    *pc = '\0';

    if (*dash == '-')
      last = strtol(dash + 1, NULL, 10);
    else
      last = first;
    }
  else
    {
    first = 0;
    last = first;
    }

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  pnode = find_nodebyname(name);

  if (cray_enabled == TRUE)
    {
    if (pnode == NULL)
      pnode = get_compute_node(name);

    if (pnode != NULL)
      {
      if (pnode->parent == alps_reporter)
        {
        while (last >= pnode->nd_slots.get_total_execution_slots())
          {
          add_execution_slot(pnode);
          }
        }
      }
    }

  if (pnode != NULL)
    {
    bool found = false;
    /* Mark node as being IN USE ...  */

    for (int i = 0; i < (int)pnode->nd_job_usages.size(); i++)
      {
      // this can't be a const because we change it below
      job_usage_info &jui = pnode->nd_job_usages[i];

      if (jui.internal_job_id == pjob->ji_internal_id)
        {
        found = true;

        while (last >= jui.est.get_total_execution_slots())
          jui.est.add_execution_slot();

        for (int index = first; index <= last; index++)
          {
          jui.est.mark_as_used(index);
          pnode->nd_slots.mark_as_used(index);
          }
        }
      }

    if (found == false)
      {
      job_usage_info jui(pjob->ji_internal_id);
        
      while (last >= jui.est.get_total_execution_slots())
        jui.est.add_execution_slot();

      for (int index = first; index <= last; index++)
        {
        jui.est.mark_as_used(index);
        pnode->nd_slots.mark_as_used(index);
        }

      pnode->nd_job_usages.push_back(jui);
      }

    if (pnode->nd_slots.get_number_free() <= 0)
      pnode->nd_state |= INUSE_JOB;

    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    }
  else
    rc = PBSE_UNKNODE;

  return(rc);
  }  /* END set_one_old() */


/*
 * Process gpu token of the form <hostname>-gpu/<first>[-<last>]
 *
 * Set gpu subjob information for <hostname> node.
 *
 * NOTE: although gpu_token is declared const, the buffer is written to (the
 * '/' and the "-gpu" suffix are overwritten with '\0'), so the caller must
 * pass writable storage.
 *
 * Indices outside of the node's gpu range [0, nd_ngpus) are ignored rather
 * than being used to index past the node's gpu array.
 *
 * @param gpu_token - I (modified) - the token to process
 * @param pjob - I - the job to record on the gpus
 * @return -1 if either argument is NULL, PBSE_NONE otherwise (including when
 *         the node cannot be found)
 */

int process_gpu_token(

  const char *gpu_token,
  job *pjob)

  {
  char           *pc;
  char           *dash;
  char           *p;
  int             first;
  int             last;
  struct pbsnode *pnode;

  // gpu_token expected to point to something like "numa3-gpu/2"

  if ((gpu_token == NULL) || (pjob == NULL))
    return(-1);
     
  // calculate range indices after the /
  if ((pc = strchr((char *)gpu_token, (int)'/')))
    {
    first = strtol(pc + 1, &dash, 10);

    *pc = '\0';

    if (*dash == '-')
      last = strtol(dash + 1, NULL, 10);
    else
      last = first;
    }
  else
    {
    first = 0;
    last = first;
    }

  // drop -gpu suffix
  if ((p = strrchr((char *)gpu_token, (int)'-')) != NULL)
    {
    if (strcmp(p, "-gpu") == 0)
      *p = '\0';
    }

  // lookup node and set gpu info on each gpu subnode
  if ((pnode = find_nodebyname(gpu_token)) != NULL)
    {
    int i;

    // the range comes from a stored string and may no longer match the
    // node, so keep it inside the gpus the node actually has
    if (first < 0)
      first = 0;

    if (last >= pnode->nd_ngpus)
      last = pnode->nd_ngpus - 1;

    for (i = first; i <= last; i++)
      {
      struct gpusubn *gn = pnode->nd_gpusn + i;

      add_job_to_gpu_subnode(pnode, gn, pjob);
      }

    // unlock node
    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  return(PBSE_NONE);
  } // END process_gpu_token()

/*
 * set_old_nodes - set "old" nodes as in use - called from pbsd_init()
 * when recovering a job in the running state.
 *
 * The job's exec_gpus and exec_host strings (when set) are copied and their
 * '+' separated entries are processed from the last entry to the first.
 * When cray is enabled and the job has a login node id, that node is
 * recorded as well.
 *
 * @return PBSE_NONE on success, PBSE_SYSTEM if a copy cannot be allocated or
 *         a gpu token is rejected, otherwise the error from set_one_old()
 */

int set_old_nodes(

  job *pjob)  /* I (modified) */

  {
  char *copy;
  char *plus;
  long  cray_enabled = FALSE;
  int   rc;

  // handle gpu info
  if (pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET)
    {
    copy = strdup(pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str);

    if (copy == NULL)
      return(PBSE_SYSTEM);

    // peel entries off the tail, cutting the string at each '+'
    for (plus = strrchr(copy, (int)'+');
         plus != NULL;
         plus = strrchr(copy, (int)'+'))
      {
      *plus = '\0';

      if (process_gpu_token(plus + 1, pjob) != PBSE_NONE)
        {
        free(copy);
        return(PBSE_SYSTEM);
        }
      }

    // what is left is the first entry
    rc = process_gpu_token(copy, pjob);

    free(copy);

    if (rc != PBSE_NONE)
      return(PBSE_SYSTEM);
    }

  if (pjob->ji_wattr[JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET)
    {
    copy = strdup(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);

    if (copy == NULL)
      {
      /* FAILURE - cannot alloc memory */

      return(PBSE_SYSTEM);
      }

    for (plus = strrchr(copy, (int)'+');
         plus != NULL;
         plus = strrchr(copy, (int)'+'))
      {
      *plus = '\0';

      rc = set_one_old(plus + 1, pjob);

      if (rc != PBSE_NONE)
        {
        free(copy);
        return(rc);
        }
      }

    rc = set_one_old(copy, pjob);

    free(copy);

    if (rc != PBSE_NONE)
      return(rc);
    } /* END if pjobs exec host is set */

  /* record the job on the alps_login if cray_enabled */
  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

  if ((cray_enabled == TRUE) &&
      (pjob->ji_wattr[JOB_ATR_login_node_id].at_flags & ATR_VFLAG_SET))
    {
    return(set_one_old(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, pjob));
    }

  return(PBSE_NONE);
  }  /* END set_old_nodes() */

    

/*
 * get_job_from_job_usage_info - look up the job a usage record refers to
 *
 * The lookup by internal job id is bracketed by tmp_unlock_node() and
 * tmp_lock_node() on pnode.
 * NOTE(review): presumably so the node lock is not held while the job is
 * being looked up - confirm against the locking rules.
 *
 * @param jui - the usage record naming the job
 * @param pnode - the node passed to tmp_unlock_node()/tmp_lock_node()
 * @return whatever svr_find_job_by_id() returned
 */

job *get_job_from_job_usage_info(
    
  job_usage_info *jui,
  struct pbsnode *pnode)

  {
  job *found;

  tmp_unlock_node(pnode, __func__, NULL, LOGLEVEL);

  found = svr_find_job_by_id(jui->internal_job_id);

  tmp_lock_node(pnode, __func__, NULL, LOGLEVEL);

  return(found);
  } /* END get_job_from_job_usage_info() */


  
/*
 * get_job_from_jobinfo - look up the job a jobinfo entry refers to
 *
 * The lookup by internal job id is bracketed by tmp_unlock_node() and
 * tmp_lock_node() on pnode.
 * NOTE(review): presumably so the node lock is not held while the job is
 * being looked up - confirm against the locking rules.
 *
 * @param jp - the jobinfo entry naming the job
 * @param pnode - the node passed to tmp_unlock_node()/tmp_lock_node()
 * @return whatever svr_find_job_by_id() returned
 */

job *get_job_from_jobinfo(
    
  struct jobinfo *jp,
  struct pbsnode *pnode)
  
  {
  job *found;

  tmp_unlock_node(pnode, __func__, NULL, LOGLEVEL);

  found = svr_find_job_by_id(jp->internal_job_id);

  tmp_lock_node(pnode, __func__, NULL, LOGLEVEL);

  return(found);
  } /* END get_job_from_jobinfo() */



/*
 * remove_temporary_hold_on_node()
 *
 */

void remove_temporary_hold_on_node(
    
  struct work_task *pwt)

  {
  pbsnode *pnode;
  char    *nd_name = (char *)pwt->wt_parm1;
  char     log_buf[LOCAL_LOG_BUF_SIZE];

  free(pwt->wt_mutex);
  free(pwt);
  
  pnode = find_nodebyname(nd_name);

  if (pnode != NULL)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Node '%s' is being marked back online after a five minute break for network failures.",
      pnode->nd_name);
    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_NODE, __func__, log_buf);
    remove_node_state_flag(pnode, INUSE_NETWORK_FAIL);

    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  free(nd_name);
  } // END remove_temporary_hold_on_node()



/*
 * update_failure_counts()
 *
 * Updates the internal success and failure counts for the node with the specified name.
 * A second consecutive success clears the failure count and removes
 * INUSE_NETWORK_FAIL if it is set. A third failure on a node that is not yet
 * marked sets INUSE_NETWORK_FAIL and schedules remove_temporary_hold_on_node()
 * to run network_fail_wait_time from now with a copy of the node name.
 *
 * @param node_name - the name of the node
 * @param rc - the return code of the last network operation
 */

void update_failure_counts(
    
  const char *node_name,
  int         rc)

  {
  char     log_buf[LOCAL_LOG_BUF_SIZE];
  bool     newly_held = false;
  pbsnode *pnode = find_nodebyname(node_name);

  if (pnode == NULL)
    return;

  if (rc != PBSE_NONE)
    {
    pnode->nd_proximal_failures++;
    pnode->nd_consecutive_successes = 0;

    if ((pnode->nd_proximal_failures > 2) &&
        ((pnode->nd_state & INUSE_NETWORK_FAIL) == 0))
      {
      snprintf(log_buf, sizeof(log_buf),
        "Node '%s' has had %d failures in close proximity, marking offline.",
        pnode->nd_name, pnode->nd_proximal_failures);
      log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_NODE, __func__, log_buf);

      update_node_state(pnode, INUSE_NETWORK_FAIL);
      newly_held = true;
      }
    }
  else
    {
    pnode->nd_consecutive_successes++;

    if (pnode->nd_consecutive_successes > 1)
      {
      pnode->nd_proximal_failures = 0;

      if (pnode->nd_state & INUSE_NETWORK_FAIL)
        {
        snprintf(log_buf, sizeof(log_buf),
          "Node '%s' has had two or more consecutive network successes, marking online.",
          pnode->nd_name);
        log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_NODE, __func__, log_buf);
        remove_node_state_flag(pnode, INUSE_NETWORK_FAIL);
        }
      }
    }

  unlock_node(pnode, __func__, NULL, LOGLEVEL);

  /* the release task is queued only after the node has been unlocked */
  if (newly_held)
    {
    set_task(WORK_Timed, time(NULL) + network_fail_wait_time, remove_temporary_hold_on_node,
             strdup(node_name), FALSE);
    }
  } // END update_failure_counts()


/* END node_manager.c */

