/*
 *	Ohio Trollius
 *	Copyright 1996 The Ohio State University
 *	NJN
 *
 *	$Id: mpil_spawn.c,v 6.1 96/11/23 22:53:30 nevin Rel $
 *
 *	Function:	- spawn MPI processes
 */

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <app_mgmt.h>
#include <app_schema.h>
#include <args.h>
#include <blktype.h>
#include <mpi.h>
#include <mpisys.h>
#include <net.h>
#include <portable.h>
#include <rpisys.h>
#include <sfh.h>
#include <terror.h>
#include <typical.h>

/*
 * public functions
 */
int			MPIL_Comm_parent();
int			MPIL_Spawn();
int			MPIL_Universe_size();

/*
 * private functions
 */
static int		spawn();
static char		*locate_aschema();

/*
 *	MPIL_Spawn
 *
 *	Function:	- spawn MPI processes
 *	Accepts:	- application schema
 *			- root process rank
 *			- parent communicator
 *			- intercomm between parents and children (returned)
 *	Returns:	- MPI_SUCCESS or error code
 */
int
MPIL_Spawn(comm, schema, root, intercomm)

MPI_Comm 		comm;
char			*schema;
int			root;
MPI_Comm 		*intercomm;

{
	MPI_Group	kgrp;			/* child group */
	MPI_Status	stat;
	struct _proc	**p;
	struct _gps	*g;
	struct _gps	*kids;			/* array of child GPS */
	int		rank;			/* caller rank */
	int		size;			/* group size */
	int		err;			/* error code */
	int		numkids;		/* num. of children spawned */
	int		mycid;			/* local max context ID */
	int		cid;			/* context ID for intercomm */
	int		msg[2];			/* two int message buffer */
	int		i;

	lam_initerr();
	lam_setfunc(BLKMPILSPAWN);
/*
 * Check the arguments.
 */
	if ((comm == MPI_COMM_NULL) || LAM_IS_INTER(comm)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_COMM, 0)));
	}

	MPI_Comm_size(comm, &size);

	if ((root >= size) || (root < 0)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_ROOT, 0)));
	}

	if ((intercomm == 0) || (schema == 0)) {
		return(lam_errfunc(MPI_COMM_WORLD,
			BLKMPILSPAWN, lam_mkerr(MPI_ERR_ARG, 0)));
	}

	LAM_TRACE(lam_tr_cffstart(BLKMPILSPAWN));
/*
 * Set debugging parameters.
 */
	g = &(comm->c_group->g_procs[root]->p_gps);

	lam_setparam(BLKMPILSPAWN, root | (g->gps_grank << 16),
				(g->gps_node << 16) | g->gps_idx);
/*
 * Synchronize all members of the parent group and get the context ID
 * for the parent-child intercommunicator.
 */
	MPI_Comm_rank(comm, &rank);

	mycid = lam_getcid();

	if (mycid < 0) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_INTERN, EFULL)));
	}

	err = MPI_Reduce(&mycid, &cid, 1, MPI_INT, MPI_MAX, root, comm);
	if (err != MPI_SUCCESS) {
		return(lam_errfunc(comm, BLKMPILSPAWN, err));
	}
	
	if (rank == root) {
/*
 * The root does the process spawning.
 */
		if (spawn(schema, comm, cid, &numkids, &kids)) {
/*
 * Inform parent group of spawn error.
 */
			err = lam_mkerr(MPI_ERR_OTHER, errno);
			
			msg[0] = -1; msg[1] = err;
			
			MPI_Bcast(msg, 2, MPI_INT, root, comm);
			
			return(lam_errfunc(comm, BLKMPILSPAWN, err));
		}
		
		msg[0] = cid; msg[1] = numkids;
	}
/*
 * Broadcast the context ID for the parent-child intercommunicator and the
 * number of children spawned to the parents. In the case of an error
 * in spawning we are actually broadcasting the error code.
 */
	err = MPI_Bcast(msg, 2, MPI_INT, root, comm);
	if (err != MPI_SUCCESS) {
		return(lam_errfunc(comm, BLKMPILSPAWN, err));
	}
	
	if (rank != root) {

		cid = msg[0];
/*
 * A context ID of -1 means an error occurred in spawning so we
 * return with the error.
 */
		if (cid == -1) {
			err = msg[1];
			return(lam_errfunc(comm, BLKMPILSPAWN, err));
		}
/*
 * Allocate buffer to receive array of child GPS.
 */
		numkids = msg[1];
		kids = (struct _gps *)
			malloc((unsigned) (numkids * sizeof(struct _gps)));
		if (kids == 0) {
			return(lam_errfunc(comm, BLKMPILSPAWN,
					lam_mkerr(MPI_ERR_OTHER, errno)));
		}
	}
/*
 * Broadcast the array of child GPS to parent group.
 */
	err = MPI_Bcast(kids, numkids * sizeof(struct _gps) / sizeof(int),
			MPI_INT, root, comm);
	if (err != MPI_SUCCESS) {
		free((char *) kids);
		return(lam_errfunc(comm, BLKMPILSPAWN, err));
	}
/*
 * Create the child group.
 */
	kgrp = (MPI_Group) malloc((unsigned) (sizeof(struct _group) +
					(numkids * sizeof(struct _proc *))));
	if (kgrp == 0) {
		free((char *) kids);
		return(lam_errfunc(comm, BLKMPILSPAWN,
					lam_mkerr(MPI_ERR_OTHER, errno)));
	}
	kgrp->g_nprocs = numkids;
	kgrp->g_myrank = MPI_UNDEFINED;
	kgrp->g_refcount = 1;
	kgrp->g_procs = (struct _proc **)
				((char *) kgrp + sizeof(struct _group));

	g = kids;
	p = kgrp->g_procs;

	for (i = 0; i < numkids; ++i, ++p, ++g) {

		*p = lam_procadd(g);
		if (*p == 0) {
			free((char *) kids);
			free((char *) kgrp);
			return(lam_errfunc(comm, BLKMPILSPAWN,
					lam_mkerr(MPI_ERR_OTHER, errno)));
		}
		(*p)->p_mode |= LAM_PCLIENT;
		(*p)->p_refcount++;
	}
/*
 * Create the parent-child intercommunicator.
 */
	*intercomm = 0;
	if (lam_comm_new(cid, comm->c_group, kgrp, LAM_CINTER, intercomm)) {
		free((char *) kids);
		free((char *) kgrp);
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_OTHER, errno)));
	}

	comm->c_group->g_refcount++;
	(*intercomm)->c_errhdl = comm->c_errhdl;
	comm->c_errhdl->eh_refcount++;

	if (!al_insert(lam_comms, intercomm)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_INTERN, errno)));
	}

	if (lam_tr_comm(*intercomm)) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_INTERN, errno)));
	}

	lam_setcid(cid);
/*
 * setup new processes
 */
	if (RPI_SPLIT(_rpi_lamd_addprocs, _rpi_c2c_addprocs, ())) {
		return(lam_errfunc(comm, BLKMPILSPAWN,
				lam_mkerr(MPI_ERR_OTHER, errno)));
	}
/*
 * Wait until all the children have initialized.
 * The root waits for rank 0 in the child world to communicate this fact and
 * then broadcasts it to the other parents.
 */
	if (rank == root) {
		err = MPI_Recv((void *) 0, 0,
				MPI_BYTE, 0, 0, *intercomm, &stat);
		if (err != MPI_SUCCESS) {
			return(lam_errfunc(comm, BLKMPILSPAWN, err));
		}
	}

	err = MPI_Bcast((void *) 0, 0, MPI_BYTE, root, comm);
	
	LAM_TRACE(lam_tr_cffend(BLKMPILSPAWN, root, comm, 0, 0));
	
	lam_resetfunc(BLKMPILSPAWN);
	return(MPI_SUCCESS);
}

/*
 *	MPIL_Comm_parent
 *
 *	Function:	- returns the parent inter-communicator
 *	Accepts:	- communicator (out)
 */
int
MPIL_Comm_parent(comm)

MPI_Comm 		*comm;

{
	lam_initerr();
	lam_setfunc(BLKMPILCOMMPARENT);
/*
 * Check the arguments.
 */
	if (comm == 0) {
		return(lam_errfunc(MPI_COMM_WORLD, BLKMPILCOMMPARENT,
				lam_mkerr(MPI_ERR_ARG, 0)));
	}
/*
 * Set the intercomm.
 */
	*comm = MPI_COMM_PARENT;
	
	lam_resetfunc(BLKMPILCOMMPARENT);
	return(MPI_SUCCESS);
}

/*
 *	MPIL_Universe_size
 *
 *	Function:	- returns the number of nodes
 *	Accepts:	- number of nodes (out)
 */
int
MPIL_Universe_size(size)

int			*size;

{
	lam_initerr();
	lam_setfunc(BLKMPILUNIVERSESIZE);
/*
 * Check the arguments.
 */
	if (size == 0) {
		return(lam_errfunc(MPI_COMM_WORLD, BLKMPILUNIVERSESIZE,
				lam_mkerr(MPI_ERR_ARG, 0)));
	}
/*
 * Set the intercomm.
 */
	*size = getncomp();

	if (*size < 0) {
		return(lam_errfunc(MPI_COMM_WORLD, BLKMPILUNIVERSESIZE,
				lam_mkerr(MPI_ERR_ARG, errno)));
	}
	
	lam_resetfunc(BLKMPILUNIVERSESIZE);
	return(MPI_SUCCESS);
}

/*
 *	spawn
 *
 *	Function:	- spawn MPI processes according to schema
 *	Accepts:	- schema
 *			- parent communicator
 *			- context ID for parent-child intercommunicator
 *			- ptr number of children (returned)
 *			- ptr array of child GPS (returned)
 *	Returns:	- 0 or LAMERROR
 */
static int
spawn(schema, comm, cid, numkids, kids)

char			*schema;
MPI_Comm		comm;
int			cid;
int			*numkids;
struct _gps		**kids;

{
	struct nmsg	nhead;			/* network msg header */
	struct _proc	**g;			/* ptr process in group */
	struct _gps	*procgps;		/* procs. GPS */
	struct _gps	*p;			/* favourite pointer */
	LIST		*app;			/* application */
	LIST		*app_sched;		/* scheduled application */
	char		*aschema;		/* application schema */
	int		nparent;		/* size of parent world */
	int		nworld;			/* size of child world */
	int4		rtf;			/* child runtime flags */
	int		rank;			/* my (spawner's) rank */
	int		ignore;			/* ignored argument */
	int		i;
/*
 * If schema is a single argument then it is an application schema file,
 * otherwise it is an explicit specification of what to execute like that
 * given to mpirun.
 */
	if (strchr(schema, ' ')) {
		app = asc_bufparse(schema, strlen(schema), &ignore);
	} else {
		aschema = locate_aschema(schema);

		if (aschema == 0) {
			errno = EINVAL;
			return(LAMERROR);
		}

		app = asc_parse(aschema, &ignore);
	}
	
	if (app == 0) {
		errno = EUSAGE;
		return(LAMERROR);
	}

	app_sched = asc_schedule(app);
	asc_free(app);
	if (app_sched == 0) {
		return(LAMERROR);
	}
/*
 * Allocate child and parent GPS array.
 */
	MPI_Comm_size(comm, &nparent);
	nworld = al_count(app_sched);
	procgps = (struct _gps *)
		malloc((unsigned) (nworld + nparent) * sizeof(struct _gps));
	if (procgps == 0) {
		asc_free(app_sched);
		return(LAMERROR);
	}
/*
 * Set environment inherited by children.  The world spawning them consists
 * solely of the parent group.
 */
	rtf = _kio.ki_rtf;
	rtf &= ~(RTF_MPIRUN | RTF_TRON | RTF_FLAT);
/*
 * Run the application.
 */
	if (asc_run(app_sched, nparent, rtf, 0, 0, procgps)) {
		asc_free(app_sched);
		free((char *) procgps);
		return(LAMERROR);
	}
	asc_free(app_sched);
/*
 * Fill in child ranks in their MPI_COMM_WORLD.
 */
	for (i = 0, p = procgps; i < nworld; ++i, ++p) {
		p->gps_grank = i;
	}
/*
 * Fill in the parent world GPS.
 */
	g = comm->c_group->g_procs;

	for (i = 0; i < nparent; ++i, ++p, ++g) {
		*p = (*g)->p_gps;
	}
/*
 * Set up the message.
 */
	MPI_Comm_rank(comm, &rank);
	nhead.nh_type = 0;
	nhead.nh_flags = DINT4MSG;
	nhead.nh_msg = (char *) procgps;
	nhead.nh_length = (nworld + nparent) * sizeof(struct _gps);
	nhead.nh_data[1] = (int4) cid;
	nhead.nh_data[2] = (int4) rank;
	nhead.nh_data[3] = (int4) lam_universe_size;
/*
 * Loop sending to each process.
 */
	for (i = 0, p = procgps; i < nworld; ++i, ++p) {
		nhead.nh_node = p->gps_node;
		nhead.nh_event = - p->gps_pid;
		if (nsend(&nhead)) {
			free((char *) procgps);
			return(LAMERROR);
		}
	}

	*numkids = nworld;
	*kids = procgps;

	return(0);
}

/*
 *	locate_aschema
 *
 *	Function:	- locate an application schema
 *	Accepts:	- filename
 *	Returns:	- full pathname or NULL
 */
static char *
locate_aschema(filename)

char			*filename;

{
	int		pathc = 0;		/* paths argc */
	char		**pathv = 0;		/* paths argv */
	char		*appdir;		/* application dir */
	char		*fullpath;		/* full pathname */

	if ((appdir = getenv("LAMAPPLDIR"))) {
		argvadd(&pathc, &pathv, appdir);
	}
	argvadd(&pathc, &pathv, "");
	argvadd(&pathc, &pathv, "$LAMHOME/boot");
	argvadd(&pathc, &pathv, "$TROLLIUSHOME/boot");

	fullpath = sfh_path_find(filename, pathv, R_OK);
	argvfree(pathv);
	return(fullpath);
}
