/*
 *	Ohio Trollius
 *	Copyright 1997 The Ohio State University
 *	NJN
 *
 *	$Id: tcp.low.c,v 6.1.1.2 97/03/24 12:19:03 nevin Exp $
 *
 *	Function:	- TCP low-level routines
 */

#include <lam_config.h>

#include <errno.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/uio.h>
#include <unistd.h>

#if NEED_SYS_SELECT_H
#include <sys/select.h>
#endif

#include <blktype.h>
#include <dl_inet.h>
#include <mpi.h>
#include <mpisys.h>
#include <net.h>
#include <rpisys.h>
#include <terror.h>
#include <typical.h>
#include <t_types.h>

/*
 * public functions
 *
 * All functions in this file are declared old style, without parameter
 * lists, so the compiler does not check the arguments at call sites.
 */
int			_tcp_advmultiple();
int			_tcp_adv1();
int			_tcp_proc_read_env();
int			_tcp_req_send_long();
int			_tcp_req_send_short();
int			_tcp_req_send_synch();
int			_tcp_req_recv();
int			_tcp_req_probe();
int			_tcp_buffered_adv();

/*
 * external functions
 *
 * NOTE(review): _c2c_comm_dead() is not called by name in this file;
 * presumably it is reached through the commdead_m() macro -- confirm.
 */
extern int		_c2c_comm_dead();
extern void		_c2c_fill_mpi_status();
extern void		_c2c_fill_wildcards();
extern void		lam_commfault();

/*
 * private functions
 *
 * Process read-state functions (installed in cp_readfn), request
 * transition functions (installed in cq_adv) and socket helpers.
 */
static int		tcp_proc_read_body();
static int		tcp_proc_read_buffer();
static int		tcp_proc_read_extra();
static int		tcp_req_send_ack_long();
static int		tcp_req_send_ack_only();
static int		tcp_req_send_body();
static int		tcp_req_rcvd_2nd();
static int		tcp_req_rcvd_body_synch();
static int		tcp_req_rcvd_ack_long();
static int		tcp_req_done();
static int		tcp_match_adv();
static int		tcp_buffer();
static int		tcp_push_body();
static int		tcp_push_env();
static int		sread();
static int		swrite();
static int		swritev();
static int		sselect();
static int		setsockblk();
static void		badsocket();

/*
 * public variables
 *
 * _tcp_read, _tcp_write and _tcp_except are the sets handed (as copies)
 * to select() by _tcp_advmultiple(); badsocket() removes a failed socket
 * from all three.
 * _tcp_block records which sockets are currently in blocking mode and
 * is maintained by setsockblk().
 * _tcp_eoferr marks sockets on which sread() treats EOF as a failure;
 * sread() clears the mark when it returns normally.
 * _tcp_sockmax bounds the select() scan and is also the socket that
 * _tcp_adv1() advances.
 */
int			_tcp_nio;		/* # processes doing tcp io */
int			_tcp_sockmax;		/* max. tcp io socket num. */
fd_set			_tcp_read;		/* read sockets */
fd_set			_tcp_write;		/* write sockets */
fd_set			_tcp_except;		/* exception sockets */
fd_set			_tcp_block;		/* blocked mode socket? */
fd_set			_tcp_eoferr;		/* eof on socket is error? */
MPI_Request		_tcp_lastreq;		/* last tcp request */
struct c2c_proc		*_tcp_smap[FD_SETSIZE];	/* map socket fd to process */

/*
 * external variables
 *
 * _c2c_flblock selects blocking versus polling behaviour throughout this
 * file; _c2c_haveadv is set to 1 whenever a request changes state.
 */
extern int		_c2c_flblock;		/* blocking flag */
extern int		_c2c_haveadv;		/* have advanced? */

/*
 * private variables
 */
static struct timeval	zerotime = { 0, 0 };	/* zero timeval, makes select() poll */

/*
 *	_tcp_advmultiple
 *
 *	Function:	- advance multiple tcp processes
 *			- select()s on copies of the _tcp_read, _tcp_write
 *			  and _tcp_except sets and advances every socket
 *			  reported ready
 *			- waits in select() if _c2c_flblock is set,
 *			  otherwise polls with a zero timeout
 *	Accepts:	- nothing
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_advmultiple()

{
	MPI_Request	req;
	int		sock;			/* socket descriptor */
	int		nready;			/* # ready sockets */
	fd_set		readfds;
	fd_set		writefds;
	fd_set		exceptfds;
/*
 * select() overwrites the sets it is given, so hand it private copies.
 */
	readfds = _tcp_read;
	writefds = _tcp_write;
	exceptfds = _tcp_except;

	if (_c2c_flblock) {
/*
 * Wait with no timeout.  Coming back with nothing ready is as much
 * a failure as select() itself failing.
 */
		nready = sselect(_tcp_sockmax + 1, &readfds, &writefds,
					&exceptfds, (struct timeval *) 0);
		if (nready <= 0) return(LAMERROR);
	}
	else {
/*
 * Poll.  Nothing ready is fine, but a failed select() must be reported
 * to the caller and not be mistaken for "nothing to do".
 */
		nready = sselect(_tcp_sockmax + 1, &readfds, &writefds,
					&exceptfds, &zerotime);
		if (nready < 0) return(LAMERROR);
		if (nready == 0) return(0);
	}
/*
 * Loop through enabled sockets and advance on each one.  nready counts
 * one for every set a socket shows up in, so it is decremented per set
 * and lets the scan stop as soon as everything reported is handled.
 */
	for (sock = 0; nready && sock <= _tcp_sockmax; sock++) {

		if (FD_ISSET(sock, &exceptfds)) {
/*
 * Oops! An exception on the socket. Remove it from further consideration
 * but still account for its entries in the read and write sets.
 */
			--nready;
			badsocket(sock);

			if (FD_ISSET(sock, &readfds)) {
				--nready;
			}
			if (FD_ISSET(sock, &writefds)) {
				--nready;
			}

			continue;
		}

		if (FD_ISSET(sock, &readfds)) {
/*
 * Readable: run the process's current read-state function without
 * blocking.
 */
			--nready;
			if (setsockblk(sock, FALSE)) return(LAMERROR);

			if (_tcp_smap[sock]->cp_readfn(_tcp_smap[sock])) {
				return(LAMERROR);
			}
		}

		if (FD_ISSET(sock, &writefds)) {
/*
 * Writable: advance the request currently writing to the process.
 */
			--nready;
			if (setsockblk(sock, FALSE)) return(LAMERROR);

			req = _tcp_smap[sock]->cp_wreq;
			if (req->rq_rpi.c2c.cq_adv(_tcp_smap[sock], req)) {
				return(LAMERROR);
			}
		}
	}

	return(0);
}

/*
 *	_tcp_adv1
 *
 *	Function:	- advance single tcp process optimally
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_adv1()

{
    MPI_Request		req;

    if (commdead_m(_tcp_lastreq)) return(0);

    if (setsockblk(_tcp_sockmax, _c2c_flblock)) return(LAMERROR);

    if (_tcp_lastreq->rq_rpi.c2c.cq_state == C2CREAD) {
/*
 * In blocking mode EOF is considered to be an error.
 */
	if (_c2c_flblock) {
	    FD_SET(_tcp_sockmax, &_tcp_eoferr);
	}
	return(_tcp_smap[_tcp_sockmax]->cp_readfn(_tcp_smap[_tcp_sockmax]));
    }
    else {
	req = _tcp_smap[_tcp_sockmax]->cp_wreq;
	return(req->rq_rpi.c2c.cq_adv(_tcp_smap[_tcp_sockmax], req));
    }
}

/*
 *	_tcp_proc_read_env
 *
 *	Function:	- read envelope from process
 *			- an envelope may arrive in pieces; cp_envbuf and
 *			  cp_nenvin track where the next piece goes and how
 *			  much is missing
 *			- once the envelope is complete, match it with and
 *			  advance a receiving request
 *	Accepts:	- process
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_proc_read_env(ps)

struct c2c_proc		*ps;

{
	int		got;			/* # bytes read */

	got = sread(ps->cp_sock, ps->cp_envbuf, ps->cp_nenvin);

	if (got <= 0) {
/*
 * Nothing was read.  When blocking, stop selecting the socket for read.
 */
		if (_c2c_flblock) {
			FD_CLR(ps->cp_sock, &_tcp_read);
		}
		return(got);
	}

	ps->cp_nenvin -= got;

	if (ps->cp_nenvin != 0) {
/*
 * Only part of the envelope is in; continue after it next time.
 */
		ps->cp_envbuf += got;
		return(0);
	}
/*
 * Envelope complete.  Rewind the input state for the next envelope,
 * convert the envelope if lam_homog is not set, and find a taker.
 */
	ps->cp_envbuf = (char *) &ps->cp_env;
	ps->cp_nenvin = sizeof(struct c2c_envl);

	if (!lam_homog) {
		mttoli4((int4 *) &ps->cp_env,
				sizeof(struct c2c_envl) / sizeof(int4));
	}

	return(tcp_match_adv(ps));
}

/*
 *	tcp_proc_read_body
 *
 *	Function:	- read the body of an MPI message from process
 *			- this is only called when there is a receiving request
 *			- when the body is complete the receiving request is
 *			  advanced and the process is set to read either the
 *			  truncated remainder or the next envelope
 *	Accepts:	- process
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_proc_read_body(ps)

struct c2c_proc		*ps;

{
	int		got;			/* # bytes read */

	got = sread(ps->cp_sock, ps->cp_msgbuf, ps->cp_nmsgin);

	if (got <= 0) {
		if (_c2c_flblock) {
			FD_CLR(ps->cp_sock, &_tcp_read);
		}
		return(got);
	}

	ps->cp_nmsgin -= got;

	if (ps->cp_nmsgin != 0) {
/*
 * Still more message data to be read.
 */
		ps->cp_msgbuf += got;
		return(0);
	}
/*
 * All of message (not including truncated data) has been read. Advance
 * the request receiving the message, then detach it from the process.
 */
	if (ps->cp_rreq->rq_rpi.c2c.cq_adv(ps, ps->cp_rreq)) {
		return(LAMERROR);
	}
	ps->cp_rreq = 0;
/*
 * Without truncated data the next thing to arrive is an envelope.
 */
	if (ps->cp_extra <= 0) {
		ps->cp_readfn = _tcp_proc_read_env;
		return(0);
	}
/*
 * Truncated data follows; start draining it right away.
 */
	ps->cp_readfn = tcp_proc_read_extra;
	return(tcp_proc_read_extra(ps));
}

/*
 *	tcp_proc_read_buffer
 *
 *	Function:	- read the body of an MPI message from process
 *			- this is called when we are buffering the message
 *			- when the body is complete the buffered entry is
 *			  detached from the process (cm_proc cleared)
 *	Accepts:	- process
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_proc_read_buffer(ps)

struct c2c_proc		*ps;

{
	int		got;			/* # bytes read */

	got = sread(ps->cp_sock, ps->cp_msgbuf, ps->cp_nmsgin);

	if (got <= 0) {
		if (_c2c_flblock) {
			FD_CLR(ps->cp_sock, &_tcp_read);
		}
		return(got);
	}

	ps->cp_nmsgin -= got;

	if (ps->cp_nmsgin != 0) {
/*
 * More of the body is outstanding; continue after what was read.
 */
		ps->cp_msgbuf += got;
		return(0);
	}
/*
 * All of message has been buffered. Set process up to read the next
 * incoming envelope.
 */
	ps->cp_readfn = _tcp_proc_read_env;
	ps->cp_bmsg->cm_proc = 0;

	return(0);
}

/*
 *	tcp_proc_read_extra
 *
 *	Function:	- read truncated data from process into a sink
 *			- cp_extra counts the bytes still to be discarded
 *	Accepts:	- process
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_proc_read_extra(ps)

struct c2c_proc		*ps;

{
	char		sink[512];		/* data sink */
	int		got;			/* # bytes read */

	for (;;) {
/*
 * Throw away at most one sink full per read.
 */
		got = sread(ps->cp_sock, sink, min(ps->cp_extra, 512));
		if (got <= 0) {
			return(got);
		}

		ps->cp_extra -= got;
		if (ps->cp_extra <= 0) {
			break;
		}
	}
/*
 * Set process up to read the next incoming envelope.
 */
	ps->cp_readfn = _tcp_proc_read_env;
	return(0);
}

/*
 *	tcp_req_send_ack_long
 *
 *	Function:	- long protocol transition from sending ack
 *			  to reading message
 *			- once the whole ack envelope is written the request
 *			  is set to match the second (C2C2ND) envelope
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_req_send_ack_long(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sent;			/* result of envelope push */

	sent = tcp_push_env(ps, req);
	if (sent <= 0) {
		return(sent);
	}
/*
 * Part of the ack is still unwritten; remain in this state.
 */
	if (req->rq_rpi.c2c.cq_nenvout != 0) {
		return(0);
	}
/*
 * The ack has been sent.  Clear the process's write request and get
 * ready to receive the message body.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;

	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_flags &= ~C2CACK;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2C2ND;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
	req->rq_rpi.c2c.cq_adv = tcp_req_rcvd_2nd;

	return(0);
}

/*
 *	tcp_req_send_ack_only
 *
 *	Function:	- protocol transition from sending ack to done
 *			- the request completes once the whole ack envelope
 *			  has been written
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_req_send_ack_only(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sent;			/* result of envelope push */

	sent = tcp_push_env(ps, req);
	if (sent <= 0) {
		return(sent);
	}
/*
 * Part of the ack is still unwritten; remain in this state.
 */
	if (req->rq_rpi.c2c.cq_nenvout != 0) {
		return(0);
	}
/*
 * The ack has been sent and with it the request is finished.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_state = LAM_RQSDONE;
	req->rq_rpi.c2c.cq_state = C2CDONE;

	return(0);
}

/*
 *	tcp_req_send_body
 *
 *	Function:	- protocol transition from writing message body to done
 *			- the request completes once no body bytes remain
 *			  to be written
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_req_send_body(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sent;			/* result of body push */

	sent = tcp_push_body(ps, req);
	if (sent <= 0) {
		return(sent);
	}
/*
 * Body bytes remain; remain in this state.
 */
	if (req->rq_rpi.c2c.cq_nmsgout != 0) {
		return(0);
	}
/*
 * All of message has been written.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_state = LAM_RQSDONE;
	req->rq_rpi.c2c.cq_state = C2CDONE;

	return(0);
}

/*
 *	_tcp_req_send_long
 *
 *	Function:	- long protocol transition from writing first envelope
 *			  to reading ack
 *			- the request becomes active as soon as any part of
 *			  the envelope has been written
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_req_send_long(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sent;			/* result of envelope push */

	sent = tcp_push_env(ps, req);
	if (sent <= 0) {
		return(sent);
	}

	req->rq_state = LAM_RQSACTIVE;
/*
 * Part of the envelope is still unwritten; remain in this state.
 */
	if (req->rq_rpi.c2c.cq_nenvout != 0) {
		return(0);
	}
/*
 * Prepare to read long protocol ack.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;

	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
	req->rq_rpi.c2c.cq_adv = tcp_req_rcvd_ack_long;

	return(0);
}

/*
 *	_tcp_req_send_short
 *
 *	Function:	- short protocol transition from writing envelope
 *			  and message body to done
 *			- the request becomes active as soon as anything has
 *			  been written and completes when neither envelope
 *			  nor body bytes remain
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_req_send_short(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sent;			/* result of push */

	sent = tcp_push_body(ps, req);
	if (sent <= 0) {
		return(sent);
	}

	req->rq_state = LAM_RQSACTIVE;
/*
 * Something is still unwritten; remain in this state.
 */
	if (req->rq_rpi.c2c.cq_nenvout != 0
			|| req->rq_rpi.c2c.cq_nmsgout != 0) {
		return(0);
	}
/*
 * Envelope and body are out and with that the request is finished.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;
	req->rq_rpi.c2c.cq_state = C2CDONE;
	req->rq_state = LAM_RQSDONE;

	return(0);
}

/*
 *	_tcp_req_send_synch
 *
 *	Function:	- short synchronous protocol transition from writing
 *			  envelope and message body to reading ack
 *			- the request becomes active as soon as anything has
 *			  been written; when everything is written it waits
 *			  for the ack, whose arrival completes it
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_req_send_synch(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sent;			/* result of push */

	sent = tcp_push_body(ps, req);
	if (sent <= 0) {
		return(sent);
	}

	req->rq_state = LAM_RQSACTIVE;
/*
 * Something is still unwritten; remain in this state.
 */
	if (req->rq_rpi.c2c.cq_nenvout != 0
			|| req->rq_rpi.c2c.cq_nmsgout != 0) {
		return(0);
	}
/*
 * Envelope and body are out.  Prepare to read the ack.
 */
	_c2c_haveadv = 1;
	ps->cp_wreq = 0;

	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
	req->rq_rpi.c2c.cq_adv = tcp_req_done;

	return(0);
}

/*
 *	tcp_req_rcvd_2nd
 *
 *	Function:	- long protocol transition from reading the envelope
 *			  at the start of the message to reading the body
 *			  of the message
 *			- the body goes straight into the request's pack
 *			  buffer and reading is started immediately
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_req_rcvd_2nd(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
/*
 * The request is done once the body has been read.
 */
	req->rq_rpi.c2c.cq_adv = tcp_req_done;
/*
 * Aim the process's body reader at the request.
 */
	ps->cp_readfn = tcp_proc_read_body;
	ps->cp_rreq = req;
	ps->cp_nmsgin = ps->cp_env.ce_len;
	ps->cp_msgbuf = req->rq_packbuf;

	return(tcp_proc_read_body(ps));
}

/*
 *	tcp_req_done
 *
 *	Function:	- protocol transition to done
 *	Accepts:	- source process (not used)
 *			- request
 *	Returns:	- 0 (always)
 */
static int
tcp_req_done(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	req->rq_state = LAM_RQSDONE;
	req->rq_rpi.c2c.cq_state = C2CDONE;
	_c2c_haveadv = 1;

	return(0);
}

/*
 *	tcp_req_rcvd_body_synch
 *
 *	Function:	- synchronous protocol transition from reading
 *			  message body to sending synch ack
 *			- the ack envelope carries this process's rank in
 *			  the request's communicator
 *	Accepts:	- source process (ignored)
 *			- request
 *	Returns:	- 0 (always)
 */
static int
tcp_req_rcvd_body_synch(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	_c2c_haveadv = 1;
/*
 * Turn the request around: it now writes an ack and is finished
 * when the ack is out.
 */
	req->rq_rpi.c2c.cq_adv = tcp_req_send_ack_only;
	req->rq_rpi.c2c.cq_state = C2CWRITE;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
/*
 * The outgoing envelope is set up last, after its fields are final.
 */
	tcp_set_out_envelope_m(req->rq_rpi.c2c);

	return(0);
}

/*
 *	tcp_req_rcvd_ack_long
 *
 *	Function:	- long protocol transition from reading ack to
 *			  done (if receiver wants 0 bytes) or sending
 *			  requested # of bytes
 *	Accepts:	- destination process
 *			- request
 *	Returns:	- 0 (always)
 */
static int
tcp_req_rcvd_ack_long(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	_c2c_haveadv = 1;
/*
 * Set message length to minimum of what sender and receiver specified.
 */
	if (ps->cp_env.ce_len <= req->rq_packsize) {
		req->rq_rpi.c2c.cq_nmsgout = ps->cp_env.ce_len;
	} else {
		req->rq_rpi.c2c.cq_nmsgout = req->rq_packsize;
	}
/*
 * Nothing to send means the request is finished already.
 */
	if (req->rq_rpi.c2c.cq_nmsgout == 0) {
		req->rq_state = LAM_RQSDONE;
		req->rq_rpi.c2c.cq_state = C2CDONE;
		return(0);
	}
/*
 * Send the body behind a second (C2C2ND) envelope.
 */
	req->rq_rpi.c2c.cq_state = C2CWRITE;
	req->rq_rpi.c2c.cq_adv = tcp_req_send_body;

	req->rq_rpi.c2c.cq_env.ce_len = req->rq_rpi.c2c.cq_nmsgout;
	req->rq_rpi.c2c.cq_env.ce_flags &= ~C2CACK;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2C2ND;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;

	tcp_set_out_envelope_m(req->rq_rpi.c2c);

	return(0);
}

/*
 *	_tcp_req_probe
 *
 *	Function:	- probe protocol transition to done
 *			- the status is filled from the incoming envelope
 *			- the incoming envelope/message is buffered
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_req_probe(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	_c2c_haveadv = 1;

	req->rq_state = LAM_RQSDONE;
	req->rq_rpi.c2c.cq_state = C2CDONE;

	_c2c_fill_mpi_status(req, ps->cp_env.ce_rank, ps->cp_env.ce_tag,
				ps->cp_env.ce_len);
/*
 * A probe does not consume the message, so keep it for a later receive.
 */
	return(tcp_buffer(ps));
}

/*
 *	_tcp_req_recv
 *
 *	Function:	- protocol transition for receive request on
 *			  matched incoming envelope
 *			- long envelope: reply with an ack
 *			- short envelope: read the body, or finish at once
 *			  (or ack a synchronous send) if there is no body
 *			  to read
 *	Accepts:	- source process
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_req_recv(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	struct c2c_envl	*env;			/* the envelope */
	int		surplus;		/* # bytes to go into sink */

	req->rq_state = LAM_RQSACTIVE;
	env = &ps->cp_env;
	_c2c_fill_wildcards(req, env);

	if (env->ce_flags & C2CLONG) {
/*
 * Got a long protocol envelope.
 * Check for message length mismatch, set status and reply with an ack.
 */
		_c2c_haveadv = 1;

		if (env->ce_len > req->rq_packsize) {
			req->rq_flags |= LAM_RQFTRUNC;
			env->ce_len = req->rq_packsize;
		}

		_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag,
					env->ce_len);

		req->rq_rpi.c2c.cq_state = C2CWRITE;
		req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
		req->rq_rpi.c2c.cq_env.ce_len = env->ce_len;
		req->rq_rpi.c2c.cq_env.ce_rank =
					req->rq_comm->c_group->g_myrank;
/*
 * A body follows the ack only if a nonzero length is asked for.
 */
		if (env->ce_len > 0) {
			req->rq_rpi.c2c.cq_adv = tcp_req_send_ack_long;
		} else {
			req->rq_rpi.c2c.cq_adv = tcp_req_send_ack_only;
		}

		tcp_set_out_envelope_m(req->rq_rpi.c2c);
		return(0);
	}
/*
 * Got a short protocol envelope.
 * Check for length mismatch and set what still must be read in.
 */
	surplus = 0;
	if (env->ce_len > req->rq_packsize) {
		surplus = env->ce_len - req->rq_packsize;
		env->ce_len = req->rq_packsize;
		req->rq_flags |= LAM_RQFTRUNC;
	}

	_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);

	if (env->ce_len != 0) {
/*
 * Read the message body.
 */
		ps->cp_rreq = req;
		ps->cp_nmsgin = env->ce_len;
		ps->cp_extra = surplus;
		ps->cp_msgbuf = req->rq_packbuf;
		ps->cp_readfn = tcp_proc_read_body;

		if (env->ce_flags & C2CSSEND) {
			req->rq_rpi.c2c.cq_adv = tcp_req_rcvd_body_synch;
		} else {
			req->rq_rpi.c2c.cq_adv = tcp_req_done;
		}

		return(tcp_proc_read_body(ps));
	}
/*
 * Zero length message. Send ack if matched a synchronous send
 * otherwise complete the request.
 */
	_c2c_haveadv = 1;

	if (env->ce_flags & C2CSSEND) {
		if (tcp_req_rcvd_body_synch((struct c2c_proc *) 0, req)) {
			return(LAMERROR);
		}
		return(0);
	}

	req->rq_rpi.c2c.cq_state = C2CDONE;
	req->rq_state = LAM_RQSDONE;

	return(0);
}

/*
 *	_tcp_buffered_adv
 *
 *	Function:	- protocol transition for a request matching
 *			  a buffered envelope/message
 *			- a probe request leaves the entry buffered; in all
 *			  other cases the buffered entry is deleted before
 *			  returning
 *	Accepts:	- request
 *			- buffered envelope/message
 *	Returns:	- 0 or LAMERROR
 */
int
_tcp_buffered_adv(req, msg)

MPI_Request		req;
struct cbuf_msg		*msg;

{
    struct c2c_envl	*env;			/* matching incoming env. */
    int			len;			/* message length */
    int			extra;			/* # bytes to go into sink */
    int			nread;			/* # bytes read */

    env = &msg->cm_env;
    if (req->rq_type == LAM_RQIPROBE) {
/*
 * The request is a probe. Set the status and leave the envelope buffered.
 */
	_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);
	req->rq_state = LAM_RQSDONE;
	req->rq_rpi.c2c.cq_state = C2CDONE;

	return(0);
    }
/*
 * Special case of synchronous send with sender process = receiver.
 * Copy data directly from senders buffer and both requests are done.
 * A non-null cm_req is the sending request.
 */
    if (msg->cm_req) {
	if (env->ce_len > req->rq_packsize) {
	    req->rq_flags |= LAM_RQFTRUNC;
	    env->ce_len = req->rq_packsize;
	}

	if (env->ce_len > 0) {
	    memcpy(req->rq_packbuf, msg->cm_buf, env->ce_len);
	}

	_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);
	req->rq_state = LAM_RQSDONE;
	req->rq_rpi.c2c.cq_state = C2CDONE;
	msg->cm_req->rq_state = LAM_RQSDONE;
	msg->cm_req->rq_rpi.c2c.cq_state = C2CDONE;
    }
    else if (env->ce_flags & C2CLONG) {
/*
 * Matched a long protocol envelope.
 * Check for message length mismatch, set status and reply with an ack.
 * Only the envelope was buffered; the body is sent after the ack, and
 * only if a nonzero length is requested.
 */
	if (env->ce_len > req->rq_packsize) {
	    req->rq_flags |= LAM_RQFTRUNC;
	    env->ce_len = req->rq_packsize;
	}

	_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, env->ce_len);
	req->rq_state = LAM_RQSACTIVE;
	req->rq_rpi.c2c.cq_state = C2CWRITE;
	req->rq_rpi.c2c.cq_env.ce_flags |= C2CACK;
	req->rq_rpi.c2c.cq_env.ce_len = env->ce_len;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;
	req->rq_rpi.c2c.cq_adv =
	    (env->ce_len > 0) ? tcp_req_send_ack_long : tcp_req_send_ack_only;
	tcp_set_out_envelope_m(req->rq_rpi.c2c);
    }
    else {
/*
 * Matched a short protocol envelope. In this case there may be a message
 * body which may or not yet be completely read in.
 *
 * Check for length mismatch and set what still must be read in.
 * Unlike the branches above, env->ce_len is left at the full length sent;
 * len is what the receiver takes and extra what must be discarded.
 */
	if (env->ce_len <= req->rq_packsize) {
	    len = env->ce_len;
	    extra = 0;
	}
	else {
	    len = req->rq_packsize;
	    extra = env->ce_len - req->rq_packsize;
	    req->rq_flags |= LAM_RQFTRUNC;
	}
	_c2c_fill_mpi_status(req, env->ce_rank, env->ce_tag, len);

	if (msg->cm_proc == 0) {
/*
 * The message has been completely buffered since there is no process
 * associated with it. Copy the message from the buffer and advance the
 * request.
 */
	    if (len) {
		memcpy(req->rq_packbuf, msg->cm_buf, len);
	    }
	    if (env->ce_flags & C2CSSEND) {
		req->rq_state = LAM_RQSACTIVE;
		if (tcp_req_rcvd_body_synch((struct c2c_proc *) 0, req)) {
		    return(LAMERROR);
		}
	    } else {
		req->rq_rpi.c2c.cq_state = C2CDONE;
		req->rq_state = LAM_RQSDONE;
	    }
	}
	else {
/*
 * There are still bytes to be read from the sender.
 * Copy into receiver's buffer what has been read so far and then
 * set the process up to read in the rest.
 * cp_nmsgin is what the process has yet to read of the ce_len bytes,
 * so the difference is what already sits in the system buffer.
 */
	    nread = env->ce_len - msg->cm_proc->cp_nmsgin;
	    if (len) {
		memcpy(req->rq_packbuf, msg->cm_buf, min(nread,len));
	    }

	    if (nread < len) {
/*
 * There is still message body to be read. Change the process read state
 * which was to read everything into the system buffer. This was because
 * there was no matching receive and the receive size was unknown.
 * From here on the process reads directly into the receiver's buffer.
 */
		msg->cm_proc->cp_nmsgin = len - nread;
		msg->cm_proc->cp_extra = extra;
		msg->cm_proc->cp_msgbuf = req->rq_packbuf + nread;
		msg->cm_proc->cp_rreq = req;
		msg->cm_proc->cp_readfn = tcp_proc_read_body;
		req->rq_state = LAM_RQSACTIVE;
		req->rq_rpi.c2c.cq_adv = (env->ce_flags & C2CSSEND) ?
				tcp_req_rcvd_body_synch : tcp_req_done;
	    }
	    else {
/*
 * The whole message has been read. Complete the request and set the
 * process read state to read the remaining bytes into the sink.
 * The (nread - len) bytes buffered beyond len are already off the
 * socket and so are not counted in what is left to sink.
 */
		if (env->ce_flags & C2CSSEND) {
		    req->rq_state = LAM_RQSACTIVE;
		    if (tcp_req_rcvd_body_synch((struct c2c_proc *) 0, req)) {
			return(LAMERROR);
		    }
		} else {
		    req->rq_rpi.c2c.cq_state = C2CDONE;
		    req->rq_state = LAM_RQSDONE;
		}
		msg->cm_proc->cp_extra = extra - (nread - len);
		msg->cm_proc->cp_readfn = tcp_proc_read_extra;
	    }
	}
    }
/*
 * Discard the buffered message.
 */
    _cbuf_delete(msg);

    return(0);
}

/*
 *	tcp_match_adv
 *
 *	Function:	- match env read from process with a read request
 *			  and advance the matched request
 *			- if no match is found then the env/msg is buffered
 *	Accepts:	- envelope's source process
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_match_adv(ps)

struct c2c_proc		*ps;

{
	MPI_Request	cand;			/* candidate request */
/*
 * There cannot be any matching recvs after a matching probe because
 * probes are blocking. Thus we may return upon the first match
 * (buffering the envelope in the case of a probe) and maintain the
 * invariant "no requests in the list match buffered envelopes". This
 * means once a request is in the list after being checked against
 * buffered envelopes it need never again be checked against any
 * buffered envelopes.
 */
	cand = ps->cp_mreq;

	while (cand) {
/*
 * Only requests in the read state can take an envelope.
 */
		if (cand->rq_rpi.c2c.cq_state == C2CREAD
			&& !_c2c_envl_cmp(&ps->cp_env,
					&cand->rq_rpi.c2c.cq_env)) {

			return(cand->rq_rpi.c2c.cq_adv(ps, cand));
		}

		cand = cand->rq_next;
	}
/*
 * Nobody wants it yet.
 */
	return(tcp_buffer(ps));
}

/*
 *	tcp_buffer
 *
 *	Function:	- buffer incoming envelope/message
 *			- a short protocol message with a body gets a body
 *			  buffer, and the process is set up to read the body
 *			  into it; otherwise only the envelope is buffered
 *			- there is never any data to be read to the
 *			  data sink when buffering
 *	Accepts:	- process envelope came in from
 *	Returns:	- 0 or LAMERROR
 */
static int
tcp_buffer(ps)

struct c2c_proc		*ps;

{
	struct cbuf_msg msg;			/* buffer list entry */
	struct cbuf_msg	*entry;			/* entry as held by the list */

	msg.cm_env = ps->cp_env;
	msg.cm_req = 0;

	if (ps->cp_env.ce_len > 0 && !(ps->cp_env.ce_flags & C2CLONG)) {
/*
 * Set up the buffer for the message body.
 */
		msg.cm_buf = (char *) malloc(ps->cp_env.ce_len);
		if (msg.cm_buf == 0) return(LAMERROR);

		msg.cm_proc = ps;
/*
 * Enter the message in the list before touching the process state, so
 * that a failure leaves the process as it was and the body buffer can
 * be given back instead of being leaked.
 * NOTE(review): assumes _cbuf_append() does not take ownership of cm_buf
 * when it fails -- confirm against its implementation.
 */
		entry = _cbuf_append(&msg);
		if (entry == 0) {
			free(msg.cm_buf);
			return(LAMERROR);
		}
/*
 * Point the process at the buffer and read as much as possible.
 */
		ps->cp_bmsg = entry;
		ps->cp_nmsgin = ps->cp_env.ce_len;
		ps->cp_msgbuf = msg.cm_buf;
		ps->cp_readfn = tcp_proc_read_buffer;

		return(tcp_proc_read_buffer(ps));
	}
	else {
/*
 * No body to read now: a zero length message or a long protocol
 * envelope.  No process is attached to the entry.
 */
		msg.cm_buf = 0;
		msg.cm_proc = 0;
		return(_cbuf_append(&msg) ? 0 : LAMERROR);
	}
}

/*
 *	tcp_push_body
 *
 *	Function:	- push request envelope and message body down the pike
 *			- whatever is written is deducted from the request's
 *			  outstanding envelope and body counts
 *	Accepts:	- tcp process
 *			- request
 *	Returns:	- # bytes written or LAMERROR
 */
static int
tcp_push_body(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	struct iovec	vec[2];			/* IO vector */
	int		sent;			/* # bytes written */
	int		bodysent;		/* # bytes msg body written */

	if (req->rq_rpi.c2c.cq_nmsgout <= 0) {
/*
 * No body outstanding, write from the envelope only.
 */
		sent = swrite(ps->cp_sock, req->rq_rpi.c2c.cq_envbuf,
				req->rq_rpi.c2c.cq_nenvout);

		if (sent > 0) {
			req->rq_rpi.c2c.cq_envbuf += sent;
			req->rq_rpi.c2c.cq_nenvout -= sent;
		}
		return(sent);
	}

	if (req->rq_rpi.c2c.cq_nenvout <= 0) {
/*
 * The envelope is out already, write from the body only.
 */
		sent = swrite(ps->cp_sock, req->rq_rpi.c2c.cq_msgbuf,
				req->rq_rpi.c2c.cq_nmsgout);

		if (sent > 0) {
			req->rq_rpi.c2c.cq_msgbuf += sent;
			req->rq_rpi.c2c.cq_nmsgout -= sent;
		}
		return(sent);
	}
/*
 * Both envelope and body are outstanding, write them with one call.
 */
	vec[0].iov_base = (caddr_t) req->rq_rpi.c2c.cq_envbuf;
	vec[0].iov_len = req->rq_rpi.c2c.cq_nenvout;
	vec[1].iov_base = (caddr_t) req->rq_rpi.c2c.cq_msgbuf;
	vec[1].iov_len = req->rq_rpi.c2c.cq_nmsgout;

	sent = swritev(ps->cp_sock, vec, 2);
	if (sent <= 0) {
		return(sent);
	}

	if (sent < req->rq_rpi.c2c.cq_nenvout) {
/*
 * Not even the envelope made it out completely.
 */
		req->rq_rpi.c2c.cq_envbuf += sent;
		req->rq_rpi.c2c.cq_nenvout -= sent;
	} else {
/*
 * The envelope is out; anything beyond its length was body.
 */
		bodysent = sent - req->rq_rpi.c2c.cq_nenvout;
		req->rq_rpi.c2c.cq_nenvout = 0;
		req->rq_rpi.c2c.cq_msgbuf += bodysent;
		req->rq_rpi.c2c.cq_nmsgout -= bodysent;
	}

	return(sent);
}

/*
 *	tcp_push_env
 *
 *	Function:	- push request envelope down the pike
 *			- whatever is written is deducted from the request's
 *			  outstanding envelope count
 *	Accepts:	- tcp process
 *			- request
 *	Returns:	- # bytes written or LAMERROR
 */
static int
tcp_push_env(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		sent;			/* # bytes written */

	sent = swrite(ps->cp_sock, req->rq_rpi.c2c.cq_envbuf,
			req->rq_rpi.c2c.cq_nenvout);

	if (sent <= 0) {
		return(sent);
	}

	req->rq_rpi.c2c.cq_nenvout -= sent;
	req->rq_rpi.c2c.cq_envbuf += sent;

	return(sent);
}

/*
 *	sread
 *
 *	Function:	- atomic socket read()
 *			- reads until the request is satisfied, the read
 *			  would block (EAGAIN), EOF is met or the socket
 *			  fails; interrupted reads (EINTR) are retried
 *			- a failed socket is handed to badsocket(), as is
 *			  EOF on a socket marked in _tcp_eoferr
 *			- time spent is added to the blocked time if tracing
 *			  is on and the socket is in blocking mode
 *	Accepts:	- socket descriptor
 *			- destination buffer
 *			- # bytes wanted
 *	Returns:	- # bytes read, or 0 upon EOF or socket failure
 */
static int
sread(sock, buf, nbytes)

int			sock;
char			*buf;
int			nbytes;

{
	int		total = 0;		/* # of bytes read */
	int		r;			/* read syscall return val */
	int		timed;			/* account blocked time? */
	double		starttime;		/* time blocking starts */

	timed = ((_kio.ki_rtf & RTF_TRON) == RTF_TRON)
			&& FD_ISSET(sock, &_tcp_block);
	if (timed) {
		starttime = ttime();
	}

	do {
		r = read(sock, buf, nbytes);

		if (r > 0) {
			total += r;
			buf += r;
			nbytes -= r;
			continue;
		}

		if (r == 0) {
/*
 * Zero bytes for a zero byte request is no EOF; the loop just ends.
 */
			if (nbytes <= 0) {
				continue;
			}
/*
 * EOF.  Whether this is a failure depends on the socket's eoferr mark.
 */
			if (FD_ISSET(sock, &_tcp_eoferr)) {
				badsocket(sock);
			}
			return(0);
		}
/*
 * The read failed.  Retry if interrupted, settle for what has been
 * read so far if it would block, otherwise give up on the socket.
 */
		if (errno == EINTR) {
			continue;
		}
		if (errno == EAGAIN) {
			break;
		}
		badsocket(sock);
		return(0);

	} while (nbytes > 0);

	if (timed) {
		_kio.ki_blktime += (ttime() - starttime);
	}
/*
 * The EOF mark covers a single read only.
 */
	FD_CLR(sock, &_tcp_eoferr);
	return(total);
}

/*
 *	swrite
 *
 *	Function:	- atomic socket write()
 *			- writes until everything is out, the write would
 *			  block (EAGAIN) or the socket fails; interrupted
 *			  writes (EINTR) are retried
 *			- a failed socket is handed to badsocket() and
 *			  nothing is reported as written
 *			- time spent is added to the blocked time if tracing
 *			  is on and the socket is in blocking mode
 *	Accepts:	- socket descriptor
 *			- source buffer
 *			- # bytes to write
 *	Returns:	- # bytes written or 0
 */
static int
swrite(sock, buf, nbytes)

int			sock;
char			*buf;
int			nbytes;

{
	int		total = 0;		/* # of bytes written */
	int		r;			/* write syscall return val */
	int		timed;			/* account blocked time? */
	double		starttime;		/* time blocking starts */

	timed = ((_kio.ki_rtf & RTF_TRON) == RTF_TRON)
			&& FD_ISSET(sock, &_tcp_block);
	if (timed) {
		starttime = ttime();
	}

	do {
		r = write(sock, buf, nbytes);

		if (r > 0) {
			total += r;
			buf += r;
			nbytes -= r;
			continue;
		}
/*
 * Zero bytes for a zero byte request is fine; the loop just ends.
 */
		if ((r == 0) && (nbytes <= 0)) {
			continue;
		}

		if (r < 0) {
/*
 * Retry if interrupted, settle for what is out so far if it would block.
 */
			if (errno == EINTR) {
				continue;
			}
			if (errno == EAGAIN) {
				break;
			}
		}
/*
 * Either a hard error or nothing written though something was
 * offered (eof).  Give up on the socket.
 */
		badsocket(sock);
		total = 0;
		break;

	} while (nbytes > 0);

	if (timed) {
		_kio.ki_blktime += (ttime() - starttime);
	}

	return(total);
}

/*
 *	swritev
 *
 *	Function:	- atomic socket writev()
 *			- writes until the whole vector is out, the write
 *			  would block (EAGAIN) or the socket fails;
 *			  interrupted writes (EINTR) are retried
 *			- after a partial write the current vector entry is
 *			  adjusted in place to continue behind what was
 *			  written; savelen/savebase hold its original values
 *			  so that it can be put back
 *			- a failed socket is handed to badsocket() and
 *			  nothing is reported as written
 *	Accepts:	- socket descriptor
 *			- IO vector (entries are modified temporarily)
 *			- # entries in the vector
 *	Returns:	- #bytes written or 0
 */
static int
swritev(sock, iov, iovcnt)

int			sock;
struct iovec		*iov;
int			iovcnt;

{
    int			nwritten = 0;		/* # of bytes written */
    int			r;			/* writev syscall return val */
    int			savelen;		/* save full length */
    char		*savebase;		/* save original base ptr */
    double		starttime;		/* time blocking starts */

/*
 * Remember the first entry as passed in.  With an empty vector the
 * saved values stay unset.
 */
    if (iovcnt > 0) {
	savelen = iov->iov_len;
	savebase = iov->iov_base;
    }

/*
 * Blocked time is accounted only if tracing is on and the socket is
 * in blocking mode.
 */
    if (((_kio.ki_rtf & RTF_TRON) == RTF_TRON) && FD_ISSET(sock, &_tcp_block)) {
	    starttime = ttime();
    }
    
    do {
	r = writev(sock, iov, iovcnt);

	if (r < 0) {
/*
 * NOTE(review): on the EAGAIN exit the current entry is not put back to
 * its saved values, so an entry that was partly written stays adjusted
 * in the caller's vector.  tcp_push_body() builds its vector anew on
 * each call.
 */
	    if (errno == EAGAIN) {
		break;
	    }
/*
 * Anything but an interrupt is a socket failure.  Put the current
 * entry back and report nothing as written.
 */
	    if (errno != EINTR) {
		iov->iov_len = savelen;
		iov->iov_base = savebase;
		badsocket(sock);
		nwritten = 0;
		break;
	    }
	}
	else if (r == 0) {	/* eof */
	    iov->iov_len = savelen;
	    iov->iov_base = savebase;

/*
 * Writing nothing is a failure only if a non-empty entry was left to
 * be written.
 */
	    while ((iovcnt > 0) && (iov->iov_len == 0)) {
		iov++;
		iovcnt--;
	    }

	    if (iovcnt > 0) {
		badsocket(sock);
		nwritten = 0;
	    }

	    break;
	}
	else {
	    nwritten += r;

/*
 * Consume the written bytes entry by entry.  An entry that is fully out
 * is put back to its saved values and the next one saved; an entry that
 * is partly out is advanced past the written part.
 */
	    while (r > 0) {

		if (r >= iov->iov_len) {
		    r -= iov->iov_len;
		    iov->iov_len = savelen;
		    iov->iov_base = savebase;
		    iov++;
		    iovcnt--;

		    if (iovcnt > 0) {
			savelen = iov->iov_len;
			savebase = iov->iov_base;
		    }
		} else {
		    iov->iov_len -= r;
		    iov->iov_base = (char *) iov->iov_base + r;
		    r = 0;
		}
	    }
	}
    } while (iovcnt > 0);

    if (((_kio.ki_rtf & RTF_TRON) == RTF_TRON) && FD_ISSET(sock, &_tcp_block)) {
	    _kio.ki_blktime += (ttime() - starttime);
    }

    return(nwritten);
}

/*
 *	sselect
 *
 *	Function:	- atomic socket select()
 *			- an interrupted select() (EINTR) is retried
 *			- time spent is added to the blocked time if tracing
 *			  is on and no timeout is given
 *	Accepts:	- same arguments as select()
 *	Returns:	- # ready descriptors or LAMERROR
 */
static int
sselect(width, readfds, writefds, exceptfds, timeout)

int			width;
fd_set			*readfds;
fd_set			*writefds;
fd_set			*exceptfds;
struct timeval		*timeout;

{
	int		nready;			/* number ready descriptors */
	int		timed;			/* account blocked time? */
	double		starttime;		/* time blocking starts */

	timed = ((_kio.ki_rtf & RTF_TRON) == RTF_TRON) && (timeout == 0);
	if (timed) {
		starttime = ttime();
	}

	for (;;) {
		nready = select(width, readfds, writefds, exceptfds, timeout);
		if (nready >= 0) {
			break;
		}
/*
 * Only an interrupt is worth another try.
 */
		if (errno != EINTR) {
			return(LAMERROR);
		}
	}

	if (timed) {
		_kio.ki_blktime += (ttime() - starttime);
	}

	return(nready);
}

/*
 *	setsockblk
 *
 *	Function:	- set socket to blk/non-blk mode
 *			- the current mode is taken from _tcp_block, so
 *			  fcntl() is called only when the mode changes
 *	Accepts:	- socket descriptor
 *			- blocking flag
 *	Returns:	- 0 or LAMERROR
 */
static int
setsockblk(sock, fl_block)

int			sock;
int			fl_block;

{
	int		isblk;			/* socket is blocking now? */

	isblk = FD_ISSET(sock, &_tcp_block) ? 1 : 0;
/*
 * Nothing to do if the socket is in the requested mode already.
 */
	if ((fl_block != 0) == isblk) {
		return(0);
	}

	if (fl_block) {
/*
 * Switch socket to blocking.
 */
		if (fcntl(sock, F_SETFL, 0)) {
			return(LAMERROR);
		}
		FD_SET(sock, &_tcp_block);
	}
	else {
/*
 * Switch socket to non-blocking.
 */
		if (fcntl(sock, F_SETFL, O_NONBLOCK)) {
			return(LAMERROR);
		}
		FD_CLR(sock, &_tcp_block);
	}

	return(0);
}

/*
 *	badsocket
 *
 *	Function:	- invalidate communicators with processes on the node
 *			  a bad socket is connected to
 *			- the socket is removed from the read, write and
 *			  exception sets
 *	Accepts:	- bad socket descriptor
 */
static void
badsocket(sock)

int			sock;

{
	struct _proc	*p;			/* process */
/*
 * Stop selecting on the socket.
 */
	FD_CLR(sock, &_tcp_read);
	FD_CLR(sock, &_tcp_write);
	FD_CLR(sock, &_tcp_except);
/*
 * Look for the processes that are reached through the socket.
 */
	for (p = lam_topproc(); p; p = lam_nextproc()) {

		if (p->p_rpi.c2c.cp_sock != sock) {
			continue;
		}

		lam_commfault(p->p_gps.gps_node);
/*
 * Run the dead communicator check on the requests that are in the
 * middle of reading from or writing to the process.
 */
		if (p->p_rpi.c2c.cp_rreq) {
			(void) commdead_m(p->p_rpi.c2c.cp_rreq);
		}
		if (p->p_rpi.c2c.cp_wreq) {
			(void) commdead_m(p->p_rpi.c2c.cp_wreq);
		}
	}
}
