/* $Id: clique_protocol.c,v 1.217 2004/12/02 04:12:42 graziano Exp $ */

#include "config_nws.h"
#include <sys/types.h>       /* required for ntohl() on Cray */
#include <limits.h>          /* INT_MAX */
#include <math.h>            /* fabs() */
#include <signal.h>          /* kill() */
#include <stdio.h>           /* sprintf() */
#include <stdlib.h>          /* free(), {m,re}alloc() */
#include <string.h>          /* string functions */
#include <netinet/in.h>      /* htonl() ntohl() */

#include "cliques.h"         /* clique messages and structures */
#include "diagnostic.h"      /* FAIL() WARN() LOG() */
#include "dnsutil.h"         /* IP/DNS conversion */
#include "experiments.h"    /* experiment structure manipulation */
#include "host_protocol.h"   /* host connection protocol */
#include "messages.h"        /* RegisterListener() */
#include "osutil.h"          /* CurrentTime() */
#include "skills.h"          /* skill invocation */
#include "strutil.h"         /* GETTOK() SAFESTRNCPY(), vstrncpy() */
#include "clique_protocol.h" /* spec for this code */
#include "nws_sensor.h"

#include "nws_api.h"

#define DEFAULT_CLIQUE_PERIOD "120"
#define KEEP_A_LONG_TIME 315360000.0
/* initial factor to multiply period by for clique t/o */
#define GAIN 0.15

/*
 * TokenInfo caches information about the token for a clique.  The #clique#
 * field holds the clique itself; this is what is passed between sensors.  The
 * #state# field indicates whether the token has been stopped (TOKEN_STOPPED,
 * e.g. via a CLIQUE_DIE method -- we retain the token so that, if a fellow
 * clique member passes us a stale token, we know not to restart the clique),
 * is presently in our possession (TOKEN_HELD), or has been passed onto a
 * fellow clique member (TOKEN_PASSED).  If we are forking child processes to
 * do experiments, then #delegatePid# holds the pid of the child process.
 * #myIndex# holds the index of this host's address in the clique host table;
 * #nextHost# the index of the clique member we need to try to
 * contact.  #expectedCycleTime# holds a forecaststate for the expected
 * cycle time.  #cycleStartTime# holds the time when, the last time we
 * had the token, we tried to contact our first fellow member.  This is
 * used to compute the cycle time when we again receive the token.
 * #nextRunTime# holds the time we wish to begin the next cycle.  For the
 * member that starts the cycle (the leader), this is the time the
 * previous cycle began plus the clique period; for all others, it's the
 * previous cycle start plus the clique time-out (which is always >= than
 * the period). 
 *
 * As of version 2.10 we use cycleStartTime for when
 * the token is stopped: it registers the time the sensor sent the
 * CLIQUE_DIE message for the clique (to not get into a loop). 
 *
 * As of version 2.10 we added #lastMeasurement# which registers the
 * last time we've started to take a round of measurements.
 *
 * As of version 2.10 we added #list# which contains the sequence of
 * indices of the fellow members. The sequence is regenerated each time
 * so that experiments are not always done in the same order.
 *
 * As of version 2.11 we record the skill we use for this clique in
 * #skill#.
 */
/* State of a clique token as cached by this sensor. */
enum TokenState {TOKEN_HELD, TOKEN_PASSED, TOKEN_STOPPED};

/* Per-clique bookkeeping; cliqueTokens is an array of these. */
typedef struct {
	Clique clique;				/* the clique itself, as passed between sensors */
	enum TokenState state;			/* held, passed on, or stopped */
	pid_t delegatePid;			/* child doing the experiment, 0 if none */
	unsigned int myIndex;			/* index of this host in clique.members */
	unsigned int nextHost;			/* position in #list# of the next member to contact */
	unsigned int *list;			/* order in which members are visited */
	NWSAPI_ForecastState *expectedCycleTime;	/* forecast state for the cycle time */
	double cycleStartTime;			/* when we last started contacting members */
	double nextRunTime;			/* when we wish to begin the next cycle */
	double lastMeasurement;			/* when the last round of measurements started */
	KnownSkills skill;			/* skill used for this clique */
} TokenInfo;

/* Module globals */
static void *lock = NULL;		/* local lock */
static TokenInfo *cliqueTokens = NULL;	/* array of known cliques, grown by AddNewClique() */
static int cliquesKnown = 0;		/* number of entries in cliqueTokens */
static int doFork = 1;			/* non-zero: fork a child to send tokens / run experiments */

/*
** NOTE: We carry host addresses in Cliques (Clique.host[n].address) in network
** format so that we can use the dnsutil functions on them.  This is a bit
** dirty, since we shouldn't be aware here of details of IPAddress as defined
** in dnsutil.h, and since it requires us to undo the host/network translation
** done by the message send/receive routines.  However, it seems the least
** ugly of an unfortunate set of choices, and we isolate the dirt by
** encapsulating the translation in {Recv,Send}Clique().
*/

/*
 * Walks the member table of #cliqueD# looking for an address/port pair on
 * which this sensor has an established interface.  On the first match the
 * table index is stored in #indx# (when #indx# is not NULL) and 1 is
 * returned.  When no member matches a warning is logged and 0 is returned.
 */
static int
FindMe(		const Clique *cliqueD,
       		unsigned int *indx) {
	int slot = 0;
	IPAddress candidate;

	while (slot < cliqueD->count) {
		candidate.addr = cliqueD->members[slot].address;
		if (!EstablishedInterface(candidate, cliqueD->members[slot].port)) {
			/* not one of ours: try the next member */
			slot++;
			continue;
		}
		if (indx != NULL) {
			*indx = slot;
		}
		return 1;
	}

	WARN1("FindMe: not found in clique %s\n", cliqueD->name);
	return 0;
}

/*
 * Looks up the clique called #cliqueName# in the cliqueTokens array while
 * holding the module lock.  When an entry matches, its array index is stored
 * in #indx# and 1 is returned; otherwise #indx# is left alone and 0 is
 * returned.
 */
static int
FindTokenInfo(const char *cliqueName,
              unsigned int *indx) {
	unsigned int pos;
	int found = 0;

	GetNWSLock(&lock);
	for (pos = 0; pos < cliquesKnown; pos++) {
		if (strncmp(cliqueName, cliqueTokens[pos].clique.name, MAX_CLIQUE_NAME_SIZE) != 0) {
			continue;
		}
		/* first match wins */
		*indx = pos;
		found = 1;
		break;
	}
	ReleaseNWSLock(&lock);

	return found;
}

/* 
 * Generates a new nexthost list. We don't want to do experiments always
 * in the same order (a bad host early in the list will hinder our
 * experiments). This function generates a new sequence for #info#: the
 * indices of all the other members in pseudo-random order, followed by
 * info->myIndex in the LAST slot. info->nextHost is reset to 0.
 *
 * The list is (re)sized to the current clique->count on every call: the
 * member count can change after the first allocation (membership changes,
 * newer tokens), and a list sized for the old count would be indexed out
 * of bounds.
 */
static void
CreateNewHostList(	TokenInfo *info) {
	unsigned int i, j, others, placed;
	unsigned int *resized;
	Clique *clique;

	/* alias */
	clique = &info->clique;

	/* an empty clique has nothing to schedule */
	if (clique->count < 1) {
		info->nextHost = 0;
		return;
	}

	/* let's initialize the seed the first time we build a list */
	if (info->list == NULL) {
		srand((unsigned int)CurrentTime());
	}

	/* let's make room for the list (REALLOC of NULL allocates) */
	resized = (unsigned int *)REALLOC(info->list, sizeof(unsigned int) * clique->count);
	if (resized == NULL) {
		ABORT("CreateNewHostList: out of memory\n");
	}
	info->list = resized;

	/* number of slots available to the other members */
	others = clique->count - 1;

	/* last slot is always taken by myIndex */
	info->list[others] = info->myIndex;

	/* let's void the previous list: clique->count marks a free slot */
	for (i = 0; i < others; i++) {
		info->list[i] = clique->count;
	}

	/* now let's create the new list: we have 0 to count -1 hosts. We
	 * never place more than #others# entries, so a myIndex that is out
	 * of range cannot make the probing below loop forever. */
	placed = 0;
	for (i = 0; i < clique->count && placed < others; i++) {
		/* i'm already set: skip my index */
		if (i == info->myIndex) {
			continue;
		}

		/* we don't need any strong randomize here: pick a slot and
		 * probe linearly for the next free one */
		j = (unsigned int )rand() % others;
		while (info->list[j] != clique->count) {
			j = (j + 1) % others;
		}
		info->list[j] = i;
		placed++;
	}

	/* now let's reset the start of the list */
	info->nextHost = 0;
}


/*
 * Makes #newClique# known to this sensor.  On return #indx# holds the index
 * of the clique's TokenInfo in cliqueTokens (for both new and already known
 * cliques).  Returns 1 when a new entry was created, 0 when the clique was
 * already known, and -1 when this sensor is not a member of the clique or
 * the clique names a skill we do not know.
 */
static int
AddNewClique(	Clique *newClique,
		unsigned int *indx) {
	unsigned int selfSlot;
	TokenInfo *entry;
	NWSAPI_Measurement seed;
	int skillId;

	/* already tracked: FindTokenInfo has filled #indx# */
	if (FindTokenInfo(newClique->name, indx)) {
		return 0;
	}

	/* a clique that does not list us is none of our business */
	if (!FindMe(newClique, &selfSlot)) {
		return -1;
	}

	/* translate the skill name into its KnownSkills value */
	skillId = 0;
	while (skillId < SKILL_COUNT) {
		if (strcmp(newClique->skill, SkillName((KnownSkills)skillId)) == 0) {
			break;
		}
		skillId++;
	}
	if (skillId == SKILL_COUNT) {
		/* we really shouldn't be here! */
		ERROR1("AddNewClique: unknown clique skill %s\n", newClique->skill);
		return -1;
	}

	/* grow the table by one and fill in the new last entry */
	GetNWSLock(&lock);
	cliquesKnown++;
	cliqueTokens = REALLOC(cliqueTokens, sizeof(TokenInfo) * cliquesKnown);
	if (cliqueTokens == NULL) {
		ABORT("AddNewClique out of memory\n");
	}
	entry = cliqueTokens + (cliquesKnown - 1);

	entry->clique = *newClique;
	entry->state = TOKEN_HELD;
	entry->delegatePid = 0;
	entry->myIndex = selfSlot;
	entry->list = NULL;
	entry->cycleStartTime = 0;
	entry->nextRunTime = 0.0;
	entry->lastMeasurement = 0.0;
	entry->skill = (KnownSkills) skillId;

	entry->expectedCycleTime = NWSAPI_NewForecastState();
	if (entry->expectedCycleTime == NULL) {
		ABORT("out of memory\n");
	}
	/* seed the forecaster with the period so that the very first
	 * query does not return nasty numbers */
	seed.timeStamp = CurrentTime();
	seed.measurement = newClique->period;
	NWSAPI_UpdateForecast(entry->expectedCycleTime, &seed, 1);

	/* build the initial visiting order */
	CreateNewHostList(entry);

	*indx = cliquesKnown - 1;
	ReleaseNWSLock(&lock);
	INFO1("AddNewClique: created clique %s\n", newClique->name);

	return 1;
}


/*
 * Reads a Clique struct from #sd# into #c#, waiting at most #tout# seconds,
 * and then converts every member address to network format (see NOTE at top
 * of module).  Returns 1 on success; a failed receive is reported through
 * FAIL().
 */
static int
RecvClique(	Socket sd,
		Clique *c,
		double tout) {
	int m;

	if(!RecvData(sd, c, cliqueDescriptor, cliqueDescriptorLength, tout)) {
		FAIL("RecvClique: data receive failed\n");
	}

	m = 0;
	while (m < c->count) {
		c->members[m].address = htonl(c->members[m].address);
		m++;
	}

	return 1;
}

/*
 * Sends #message# together with a copy of #cliqueD# on #sd#, using #tout#
 * as the timeout.  The member addresses of the copy are converted to host
 * format first (see note at top of module); #cliqueD# itself is not
 * modified.  Returns whatever SendMessageAndData() returns.
 */
static int
SendClique(	Socket sd,
		MessageType message,
		const Clique *cliqueD,
		double tout) {
	Clique wire = *cliqueD;
	int m;

	/* on hosts where htonl() is the identity there is nothing to swap */
	if (htonl(1) == 1) {
		return SendMessageAndData(sd, message, &wire, cliqueDescriptor, cliqueDescriptorLength, tout);
	}

	for (m = 0; m < cliqueD->count; m++) {
		wire.members[m].address = ntohl(wire.members[m].address);
	}

	return SendMessageAndData(sd, message, &wire, cliqueDescriptor, cliqueDescriptorLength, tout);
}


/*
 * Attempts to send #message#, accompanied by #cliqueD#, to each
 * participating host in turn except the #myIndex#'th one (assumed to be
 * this sensor). It forks to net get the process stuck on unresponsive
 * hosts. If #broadcast# send a message to all the members. 
 */
static void
SendToken(	int message,
		const Clique *cliqueD,
		unsigned int myIndex,
		int broadcast) {
	IPAddress addr;
	unsigned int i, ret;
	Socket sd;
	char *name;
	pid_t pid;

	/* if we are forking, we fork here too to avoid to get stuck
	 * sending the token */
	if (doFork) {
		if (!CreateLocalChild(&pid, NULL, NULL)){
			WARN("SendToken: fail to fork!\n");
		} else {
			if (pid > 0) {
				/* parent is done */
				return;
			}
			/* child is here */
		}
	}

	/* let's forward it to the first guy that answer after us */
	i = (myIndex + 1) % cliqueD->count;
	for (; i != myIndex; i = (i + 1) % cliqueD->count) {
		MessageHeader header;

		addr.addr = cliqueD->members[i].address;

		/* let's try to send the token */
		ret = 0;
		if (CallAddr(addr, cliqueD->members[i].port, &sd, -1)
				&& SendClique(sd, message, cliqueD, -1)) {
			if (message == CLIQUE_TOKEN_FWD) {
				if (!RecvHeader(sd, &header, -1)) {
					WARN("SendToken: failed to receive essage\n");
				} else if (header.message == CLIQUE_FAILED) {
					ERROR("SendToken: receive CLIQUE_FAILED\n");
				} else if (header.message != CLIQUE_ACK) {
					ERROR("SendToken: receive unknown message\n");
				} else {
					/* cool */
					ret =1;
				}
			} else {
				ret = 1;
			}
		}
		DROP_SOCKET(&sd);

		/* now if we don't want to broadcast we just need
		 * to have talked to a single host so we are done */
		if (ret && !broadcast) {
			break;
		} else if (!ret) {
			name = IPAddressMachine_r(addr);
			if (name == NULL) {
				WARN("SendToken: failed to resolve name!\n");
				name = strdup("noname");
				if (name == NULL) {
					ABORT("SendToken: out of memory\n");
				}
			}
			WARN2("SendToken: failed to send token to %s:%d\n", name, cliqueD->members[i].port);
			FREE(name);
		}
	}

	if (doFork) {
		exit(1);
	}

	return;
}


/*
 * Registers the clique in #info# with the name server.  Only the clique
 * leader un/registers the clique: for any other member (or a NULL #info#)
 * this is a no-op.  The registration carries the clique options, period,
 * member list, measured resources, skill name and the currently forecast
 * cycle time.
 */
static void
RegisterClique(TokenInfo *info) {
	int i, j, listed;
	struct host_cookie member;
	char *opts, res[EXP_LIST_SIZE], *tmp, skillName[EXP_NAME_SIZE];
	const MeasuredResources *resources;
	Object toRegister;
	IPAddress addr;
	NWSAPI_ForecastCollection forecast;

	/* Only the clique leader un/registers the clique (check for NULL
	 * before looking inside #info#). */
	if (info == NULL || info->clique.leader != info->myIndex) {
		return;
	}

	/* get the resources */
	res[0] = '\0';
	if (!SkillResources(info->skill, &resources, &j)) {
		ERROR("RegisterClique: failed to get info on skill\n");
		return;
	}
	for(i = 0; i < j; i++) {
		/* we need room for the separator (if any), the name and
		 * the terminating NUL */
		if (sizeof(res) < (strlen(res) + ((i > 0) ? 1 : 0) + strlen(ResourceName(resources[i])) + 1)) {
			WARN("RegisterClique: activity.resources is too small!\n");
			break;
		}
		if(i > 0) {
			strcat(res, "\t");
		}
		strcat(res, ResourceName(resources[i]));
	}

	/* get the skill name */
	SAFESTRCPY(skillName, info->clique.skill);

	/* get the options */
	i = 30 + strlen(info->clique.options);	/* size of the string
						   (let's be generous:
						   we'll realloc anyway) */
	opts = (char *)MALLOC(sizeof(char) * i);
	if (opts == NULL) {
		ABORT("RegisterClique: out of memory\n");
	}
	if (info->clique.options[0] == '\0') {
		sprintf(opts, "period:%d\tmember:", (int)info->clique.period);
	} else {
		sprintf(opts, "%s\tperiod:%d\tmember:", info->clique.options, (int)info->clique.period);
	}

	/* append the members; #listed# counts the ones actually written so
	 * that a skipped first member does not leave a leading comma */
	listed = 0;
	for(i = 0; i < info->clique.count; i++) {
	     	char s[MAX_HOST_NAME];

		addr.addr = info->clique.members[i].address;
		tmp = IPAddressMachine_r(addr);
		if (tmp == NULL) {
			WARN("RegisterClique: cannot resolve to name: trying to use IP address.\n");
			tmp = IPAddressImage_r(addr);
			if (tmp == NULL) {
				ERROR("RegisterClique: cannot resolve to IP address: removing host!\n");
				continue;
			}
		}
		Host2Cookie(tmp, info->clique.members[i].port, &member);
		FREE(tmp);
		vstrncpy(s, sizeof(s), 2, (listed > 0) ? "," : "", HostCImage(&member));
		opts = (char *) REALLOC(opts, strlen(opts) + strlen(s) + 1);
		if (opts == NULL) {
			ABORT("RegisterClique: out of memory\n");
		}
		strcat(opts, s);
		listed++;
	}

	/* let's force an unregistration */
	UnregisterObject(info->clique.name);

	/* let's register now: RegisterObject takes care of duplicates */
	toRegister = CreateActivityObject(info->clique.name, 
			CLIQUE_CONTROL_NAME,
			EstablishedRegistration(),
			opts,
			res,
			skillName);

	/* as of version 2.10 we try to print the expected cycle time
	 * (to have an idea of the current period (mainly if it's too
	 * long there will be collisions). #res# is reused as scratch
	 * space, so bound the write to its size. */
	NWSAPI_ComputeForecast(info->expectedCycleTime, &forecast);
	snprintf(res, sizeof(res), "%.0f", forecast.forecasts[NWSAPI_MSE_FORECAST].forecast);
	AddNwsAttribute(&toRegister, "cycleTime", res);

	RegisterObject(toRegister);
	FreeObject(&toRegister);
	FREE(opts);
}


/*
 * Takes the clique token #info# back from the child process that was
 * delegated to conduct the experiment.  A zero delegatePid means the token
 * was already recovered and nothing happens.  Otherwise the pid is cleared
 * and, unless the token is stopped, the next run is scheduled immediately.
 */
static void
ActivateToken(TokenInfo *info) {
	if (info->delegatePid == 0) {
		/* already recovered */
		return;
	}

	info->delegatePid = 0;

	/* experiments are only running while the token is held or passed */
	switch (info->state) {
	case TOKEN_HELD:
	case TOKEN_PASSED:
		info->nextRunTime = 0.0;
		break;
	default:
		break;
	}
}


/*
 * Reclaims the clique in #info# from a child that took too long to come
 * back: the child is sent SIGKILL and delegatePid is cleared.  Returns 1
 * if there was a child to kill, 0 if no child was outstanding.
 */
static int
RecoverChild(	TokenInfo *info) {
	pid_t runaway = info->delegatePid;

	if (runaway == 0) {
		/* nobody to recover */
		return 0;
	}

	WARN2("RecoverChild: wresting clique %s away from child %d\n", info->clique.name, info->delegatePid);
	(void)kill(runaway, SIGKILL);
	info->delegatePid = 0;

	return 1;
}

/*
 * The token timed out: this sensor takes over the clique in #info#.  It
 * becomes the leader, bumps the clique instance by its own index, resets
 * the timing fields, marks the token as held, rebuilds the host list, kills
 * any outstanding child and finally re-registers the clique.
 */
static void
AdoptToken(	TokenInfo *info) {
	LOG1("AdoptToken: taking leadership for clique %s\n", info->clique.name);

	/* the regenerated token carries us as leader */
	info->clique.leader = info->myIndex;
	info->clique.instance += info->myIndex;

	/* we hold the token now and start from scratch */
	info->state = TOKEN_HELD;
	info->lastMeasurement = 0;
	info->cycleStartTime = 0;
	info->nextRunTime = 0.0;

	/* new visiting order, no leftover child, fresh registration */
	CreateNewHostList(info);
	RecoverChild(info);
	RegisterClique(info);
}


/*
 * A "local" function of HandleCliqueMessage().  Handles the forwarding
 * of a token: #recvdClique# is the clique that came with a
 * CLIQUE_TOKEN_FWD message.
 *
 * Returns 0 only when AddNewClique() reports -1 for the clique; in every
 * other case (new clique, stopped clique, stale token, accepted token) it
 * returns 1.  The caller answers CLIQUE_ACK on 1 and CLIQUE_FAILED on 0.
 */
static int
DoTokenRecv(Clique *recvdClique) {
	TokenInfo *info;
	int i;
	unsigned int tokenIndex;
	char name[MAX_CLIQUE_NAME_SIZE];

	/* let's try to add this clique to the known cliques: i is 1 for a
	 * brand new clique, 0 for a known one, -1 on rejection */
	i = AddNewClique(recvdClique, &tokenIndex);
	if (i == -1) {
		WARN1("DoTokenRecv: not in clique %s\n", recvdClique->name);
		return 0;  
	}

	/* NOTE(review): the lock is taken after AddNewClique() returned, so
	 * cliqueTokens could in principle change in between -- confirm
	 * whether concurrent callers exist. */
	GetNWSLock(&lock);
	info = &cliqueTokens[tokenIndex];

	/* now, if it's an old clique I need to check it */
	if (i == 0) {
		/* first of all let's see if the clique has been stopped
		 * and someone still has the token circulating. I can
		 * tell that by the generation time of the clique: every
		 * time it is (re)started the time will be updated */
		if (info->clique.whenGenerated >= recvdClique->whenGenerated 
				&& info->state == TOKEN_STOPPED) {
			/* copy the name and release the lock before calling
			 * StopCliqueActivity() */
			SAFESTRCPY(name, recvdClique->name);
			ReleaseNWSLock(&lock);
			StopCliqueActivity(name);
			return 1;
		}

		/* now let's see if the received clique is valid: ours wins
		 * if it is younger, or of the same age with a higher
		 * instance, or same age and instance with a higher leader */
		if (info->clique.whenGenerated > recvdClique->whenGenerated
				|| (info->clique.whenGenerated == recvdClique->whenGenerated && info->clique.instance > recvdClique->instance)
				|| (info->clique.whenGenerated == recvdClique->whenGenerated && info->clique.instance == recvdClique->instance && info->clique.leader > recvdClique->leader)) {
			/* this is a stale token: ignore it */
			ERROR5("DoTokenRecv: stale %s token; %f:%f vs. %f:%f\n", recvdClique->name,recvdClique->whenGenerated,recvdClique->instance, info->clique.whenGenerated, info->clique.instance);
			ReleaseNWSLock(&lock);

			return 1;
		}

		/* now, let's see if the token is a newer than what we
		 * knew */
		if (info->clique.whenGenerated != recvdClique->whenGenerated ||
				info->clique.instance != recvdClique->instance 
				|| info->clique.leader != recvdClique->leader) {
			LOG1("DoTokenRecv: received newer token for clique %s.\n", recvdClique->name);
			info->cycleStartTime = 0;
			info->lastMeasurement = 0;

			/* in this case we need to restart everything */
			RecoverChild(info);
		}

		/* update the clique */
		/* NOTE(review): info->myIndex and info->list are not
		 * refreshed here; if the received clique has a different
		 * member table they may be out of date -- confirm. */
		info->clique = *recvdClique;
	}

	/* we received the token so let's schedule some work:
	 * RunCliqueJob will take care of what to do with this token. */
	info->state = TOKEN_HELD;

	/* we might have already started the experiments (say the token
	 * is late): in this case we let it finish on its own) */
	if (info->delegatePid == 0) {
		info->nextRunTime = 0;
	}

	/* let's be sure we have it registered */
	RegisterClique(info);
	ReleaseNWSLock(&lock);

	return 1;
}

/*
 * Adds (#join# non-zero) or removes (#join# zero) the host #toAdd# to/from
 * the locally known clique #cliqueName#.  The host is looked up with the
 * default sensor port; when joining we first check that we can talk to it.
 * A successful change counts as a restart of the clique: whenGenerated and
 * instance are reset and this sensor adopts the token.
 *
 * Returns 1 on success and 0 on any failure (bad parameters, unknown
 * clique, unresolvable or unreachable host, we are not part of the clique,
 * member already present / clique full on join, member missing on remove).
 */
static int
Membership(	char *cliqueName, 
		char *toAdd,
		int join) {
	unsigned int i, myIndex;
	TokenInfo *info;
	IPAddress addr;
	struct host_cookie tmp;
	
	/* sanity check */
	if (cliqueName == NULL || toAdd == NULL) {
		ERROR("Membership: Invalid parameter\n");
		return 0;
	}

	/* finding the clique */
	if (!FindTokenInfo(cliqueName, &i)) {
		ERROR1("Membership: clique %s not known\n", cliqueName);
		return 0;
	}

	/* finding the host address */
	if (!Host2Cookie(toAdd, DefaultHostPort(SENSOR_HOST), &tmp)) {
		ERROR1("Membership: I don't understand hostname %s\n", toAdd);
		return 0;
	}
	if (!IPAddressValue(tmp.name, &addr)) {
		ERROR1("Membership: cannot translate hostname %s\n", toAdd);
		return 0;
	}

	/* check if we can talk to the sensor */
	tmp.sd = NO_SOCKET;
	if (join && !ConnectToHost(&tmp, NULL)) {
		ERROR1("Membership: cannot talk to %s\n", toAdd);
		return 0;
	}
	DisconnectHost(&tmp);


	/* we are working on cliqueTokens for a bit, so we need to lock
	 * the whole thing down */
	GetNWSLock(&lock);
	info = &cliqueTokens[i];		/* alias */
	if (!FindMe(&info->clique, &myIndex)) {
		ERROR1("Membership: I'm not part of clique %s\n", cliqueName);
		ReleaseNWSLock(&lock);
		return 0;
	}

	/* check if the address is in there */
	for (i = 0; i < info->clique.count; i++) {
		if (info->clique.members[i].address == addr.addr &&
			info->clique.members[i].port == tmp.port) {
			break;
		}
	}
	if (join && (i < info->clique.count || info->clique.count >= MAX_MEMBERS)) {
		if (i < info->clique.count) {
			ERROR("Membership: member already present\n");
		} else {
			ERROR("Membership: reached max numbers of members\n");
		}
		ReleaseNWSLock(&lock);
		return 0;
	} else if (!join && i >= info->clique.count) {
		ERROR("Membership: member not present\n");
		ReleaseNWSLock(&lock);
		return 0;
	}

	/* we got here: add or remove the member */
	if (join) {
		/* add the member to the clique: it goes at the end, so our
		 * own index does not move */
		info->clique.members[info->clique.count].address = addr.addr;
		info->clique.members[info->clique.count].port = tmp.port;
		info->clique.count++;
	} else {
		/* a member in front of us goes away: everybody behind it
		 * (us included) moves down one slot */
		if (i < myIndex) {
			myIndex--;
		}
		/* NOTE(review): when i is our own index we are removing
		 * ourselves and myIndex no longer names this host; the
		 * intended behaviour for that case is not visible here. */

		/* remove the member: i is the index */
		for (; (i+1) < info->clique.count; i++) {
			info->clique.members[i].address = info->clique.members[i+1].address;
			info->clique.members[i].port =info->clique.members[i+1].port;
		}
		info->clique.count--;
	}

	/* AdoptToken() relies on info->myIndex, so make it match the
	 * member table as it is now */
	info->myIndex = myIndex;

	/* we consider this a restart of the clique so we re-generate the
	 * time */
	info->clique.whenGenerated = CurrentTime();
	info->clique.instance = 1;
	AdoptToken(info);
	ReleaseNWSLock(&lock);
	
	return 1;
}

/*
 * Serves as the listener for clique-specific messages arriving on #sd#
 * with header #header#:
 *   CLIQUE_ACTIVATE      a child hands the token back: reactivate it;
 *   CLIQUE_DIE           stop the clique activity;
 *   CLIQUE_TOKEN_FWD     a token arrives: answer CLIQUE_ACK/CLIQUE_FAILED;
 *   CLIQUE_EXPERIMENT    a child reports a measurement to be stored;
 *   CLIQUE_NEW_MEMBER /
 *   CLIQUE_REMOVE_MEMBER membership change, payload "clique\0host".
 * Lengths taken from the wire are validated before they are used to size
 * or index buffers.  On receive errors the socket is dropped; otherwise it
 * is handed back with SocketIsAvailable().
 */
static void
HandleCliqueMessage( 	Socket *sd,
			MessageHeader header) {
	char *data,
		*cliqueName,
		*hostName;
	DataDescriptor descriptor = SIMPLE_DATA(CHAR_TYPE, 0);
	DataDescriptor measDescriptor = SIMPLE_DATA(DOUBLE_TYPE, 3);
	double d[3];
	NWSAPI_Measurement toStore;
	Clique clique;
	unsigned int cliqueIndex;

	switch(header.message) {
	case CLIQUE_ACTIVATE:
	case CLIQUE_DIE:
	case CLIQUE_TOKEN_FWD:
		/* these three all carry a Clique as payload */
		if(!RecvClique(*sd, &clique, -1)) {
			DROP_SOCKET(sd);
			ERROR("HandleCliqueMessage: receive failed\n");
			return;
		}

		switch(header.message) {
		case CLIQUE_ACTIVATE:
			if (!FindTokenInfo(clique.name, &cliqueIndex)) {
				ERROR1("HandleCliqueMessage: unknown clique %s\n", clique.name);
			} else {
				GetNWSLock(&lock);
				ActivateToken(&cliqueTokens[cliqueIndex]);
				ReleaseNWSLock(&lock);
			}
			break;

		case CLIQUE_DIE:
			StopCliqueActivity(clique.name);
			break;

		case CLIQUE_TOKEN_FWD:
			if (DoTokenRecv(&clique)) {
				if (!SendMessage(*sd, CLIQUE_ACK, -1)) {
					WARN("HandleCliqueMessage: ack failed\n");
				}
			} else {
				SendMessage(*sd, CLIQUE_FAILED, -1);
			}
			break;
		}

		break;

	case CLIQUE_EXPERIMENT:
		/* the first data are the measurement/value pair and the
		 * length of the series registration */
		if(!RecvData(*sd, d, &measDescriptor, 1, -1)) {
			ERROR("HandleCliqueMessage: data receive failed\n");
			DROP_SOCKET(sd);
		} else if (!(d[2] >= 0.0) || d[2] >= (double)INT_MAX) {
			/* the length comes off the wire: refuse anything
			 * (negative, NaN, too big) we cannot turn into a
			 * sane buffer size */
			ERROR("HandleCliqueMessage: invalid series length\n");
			DROP_SOCKET(sd);
		} else {
			/* series registration will be here */
			descriptor.repetitions = (int)d[2];
			data = (char *)MALLOC(descriptor.repetitions + 1);
			if(data == NULL) {
				ERROR("HandleCliqueMessage: out of memory\n");
				DROP_SOCKET(sd);
				break;
			} 
			if(!RecvData(*sd, data, &descriptor, 1, -1)) {
        			ERROR("HandleCliqueMessage: receive failed\n");
				DROP_SOCKET(sd);
			} else {
				/* terminate the string */
				data[descriptor.repetitions] = '\0';

				/* got them: set the Experiment */
				toStore.timeStamp = d[0];
				toStore.measurement = d[1];

				/* register the experiment */
				RegisterExperiment(data, &toStore);
			}
			FREE(data);
		}
    		break;

	case CLIQUE_NEW_MEMBER:
	case CLIQUE_REMOVE_MEMBER:
		descriptor.repetitions = header.dataSize;
		data = (char *)MALLOC(header.dataSize + 1);
		if (data == NULL) {
        		ERROR("HandleCliqueMessage: out of memory\n");
			DROP_SOCKET(sd);
			return;
		}

		if(!RecvData(*sd, data, &descriptor, 1, -1)) {
        		ERROR("HandleCliqueMessage: receive failed\n");
			DROP_SOCKET(sd);
		} else {
			/* NULL terminate the string */
			data[header.dataSize] = '\0';
			
			/* the payload is "clique\0host": the host name
			 * starts right after the first terminator, which
			 * must therefore lie inside the received data */
			cliqueName = data;
			if (strlen(cliqueName) >= (size_t)header.dataSize) {
				ERROR("HandleCliqueMessage: malformed membership request\n");
				SendMessage(*sd, CLIQUE_FAILED, -1);
			} else {
				hostName = cliqueName + strlen(cliqueName) + 1;

				if (Membership(cliqueName, hostName, (header.message == CLIQUE_NEW_MEMBER))) {
					SendMessage(*sd, CLIQUE_ACK, -1);
				} else {
					SendMessage(*sd, CLIQUE_FAILED, -1);
				}
			}

		}
		FREE(data);
		break;

	default:
		ERROR("HandleCliqueMessage: unrecognized command\n");
		DROP_SOCKET(sd);
		break;
	}

	/* done with the socket */
	if (*sd != NO_SOCKET) {
		SocketIsAvailable(*sd);
	}
}

/*
 * Asks the sensor behind #cookie# to add (#join# non-zero) or remove
 * (#join# zero) #hostName# to/from the clique #cliqueName#.  The request
 * payload is "cliqueName\0hostName\0"; #tout# is the send timeout.
 * Returns 1 if the remote sensor answered CLIQUE_ACK, 0 on NULL
 * parameters, out of memory, communication failure or any other answer.
 */
static int
CliqueMembership(	struct host_cookie *cookie,
			const char *cliqueName,
			const char *hostName,
			int join,
			int tout) {
	DataDescriptor descriptor = SIMPLE_DATA(CHAR_TYPE, 0);
	MessageType type;
	MessageHeader header;
	char *data;
	size_t nameLen;
	int ret = 1;

	/* sanity check */
	if (cliqueName == NULL || hostName == NULL || cookie == NULL) {
		ERROR("CliqueMembership: NULL parameters\n");
		return 0;
	}

	/* prepare the data to send: both names, the separator between
	 * them and the terminating NUL */
	nameLen = strlen(cliqueName);
	data = (char *)MALLOC(nameLen + strlen(hostName) + 2);
	if (data == NULL) {
		ERROR("CliqueMembership: out of memory\n");
		return 0;
	}
	descriptor.repetitions = sprintf(data, "%s\t%s", cliqueName, hostName);
	/* the trailing NUL travels too */
	descriptor.repetitions++;
	/* turn the separator into a terminator: "clique\0host\0" */
	data[nameLen] = '\0';

	/* join or leave the club */
	if (join) {
		type = CLIQUE_NEW_MEMBER;
	} else {
		type = CLIQUE_REMOVE_MEMBER;
	}

	/* let see if we can talk to the guy */
	if (!ConnectToHost(cookie, NULL)) {
		ret = 0;
		ERROR("CliqueMembership: cannot talk to master\n");
	} else if (!SendMessageAndData(cookie->sd,
				type,
				data,
				&descriptor,
				1,
				tout)) {
		ret = 0;
		ERROR("CliqueMembership: cannot send applications\n");
	} else if (!RecvHeader(	cookie->sd, 
				&header,
				-1)) {
		ERROR("CliqueMembership: failed to receive ack message\n");
		ret = 0;
	} else {
		/* finally let's check the message we got back */
		if (header.message == CLIQUE_FAILED) {
			ERROR("CliqueMembership: receive NACK message\n");
			ret = 0;
		} else if (header.message != CLIQUE_ACK) {
			ERROR("CliqueMembership: receive unknown message\n");
			ret = 0;
		}
	}

	/* the request buffer is ours: release it on every path */
	FREE(data);

	return ret;
}

/*
 * Asks the sensor behind #cookie# to add #hostName# to the clique
 * #cliqueName#.  Thin wrapper around CliqueMembership() with join set to
 * 1; #tout# is handed through unchanged and the result of
 * CliqueMembership() is returned as is.
 */
int
CliqueJoin(	struct host_cookie *cookie,
		const char *cliqueName,
		const char *hostName,
		int tout) {
	return CliqueMembership(cookie, cliqueName, hostName, 1, tout);
}

/*
 * Asks the sensor behind #cookie# to remove #hostName# from the clique
 * #cliqueName#.  Thin wrapper around CliqueMembership() with join set to
 * 0; #tout# is handed through unchanged and the result of
 * CliqueMembership() is returned as is.
 */
int
CliqueRemove(	struct host_cookie *cookie,
		const char *cliqueName,
		const char *hostName, 
		int tout) {
	return CliqueMembership(cookie, cliqueName, hostName, 0, tout);
}


/*
 * Fetches the registration of the clique #name# from our name server and
 * runs every host listed in its "member" attribute (with any ":port"
 * suffix cut off) through IsValidIP(), so that the names get resolved and
 * can be cached.  Returns 0 if the clique or its member attribute cannot
 * be retrieved, 1 otherwise (resolution itself is best effort).
 */
static int
ResolveCliqueMembers(	const char *name) {
	char filter[255 + 1];
	ObjectSet objs;
	char *memberList, *colon;
	const char *cursor;

	objs = NewObjectSet();

	/* query the name server for the clique by name */
	snprintf(filter, 255, "name=%s", name);
	if (!RetrieveFromMyNameserver(filter, &objs)) {
		INFO1("ResolveCliqueMembers: cannot find clique %s\n", name);
		FreeObjectSet(&objs);
		return 0;
	}

	/* take a private copy of the member list, then let the set go */
	memberList = NwsAttributeValue_r(FindNwsAttribute(NextObject(objs, NULL), "member"));
	if (memberList == NULL) {
		INFO1("ResolveCliqueMembers: cannot find members for %s\n", name);
	}
	FreeObjectSet(&objs);
	if (memberList == NULL) {
		return 0;
	}

	/* #filter# doubles as the token buffer from here on */
	cursor = memberList;
	while (GETTOK(filter, cursor, ",", &cursor)) {
		colon = strchr(filter, ':');
		if (colon != NULL) {
			*colon = '\0';
		}
		IsValidIP(filter);
	}
	FREE(memberList);

	/* we've done our best*/
	return 1;
}
	



/*
 * A "local" function of RunCliqueJob().  Performs the experiment specified by
 * #info# with the member at info->list[info->nextHost].  The skill options
 * are the clique options plus "target:host:port", "fork:yes|no" (unless
 * already given) and, in a forked child, "childToParent:<socket>".
 *
 * With doFork set the experiment runs in a child: the parent returns 1 as
 * soon as the child is started (info->delegatePid holds its pid); the child
 * sends each successful result to the parent as CLIQUE_EXPERIMENT, then a
 * CLIQUE_ACTIVATE, and exits -- it never returns to the caller.  Without
 * fork the results are registered directly.
 *
 * Returns 1 if successful, else 0.
 */
static int
RunCliqueExperiment(	TokenInfo *info) {
	Socket childToParent;
	Clique *clique;
	NWSAPI_Measurement toStore;
	int i, 
		resultsLength,
		timeOut,
		ret;
	char *options, *tmp = NULL, *grown;
	DataDescriptor descriptor = SIMPLE_DATA(CHAR_TYPE, 0);
	DataDescriptor measDescriptor = SIMPLE_DATA(DOUBLE_TYPE, 3);
	SkillResult *results = NULL;
	Object toRegister;
	IPAddress addr;
	double d[3];

	/* alias: set it before anything (the warning below included)
	 * looks at it */
	clique = &info->clique;

	/* we don't do experiment with ourselves */
	if (info->list[info->nextHost] == info->myIndex) {
		WARN1("RunCliqueExperiment: clique %s called with my index\n", clique->name);
		return 0;
	} 

	/* let's build the option with the member we want to do
	 * experiments with */
	options = strdup(clique->options);
	if (options == NULL) {
		ERROR("RunCliqueExperiment: out of memory\n");
		return 0;
	}

	/* now, here we do something weird: we try to work around broken
	 * DNS (the one that doesn't have reverse DNS). If we cannot
	 * resolve to a name, we search for the clique name in the
	 * nameserver and we resolve the members: this means that we'll
	 * have now a cache with the right names<->address combination
	 * and we should be able to go. If we fail doing that, we just
	 * complain and resolve to IP */
	for (i = 0; i <= 1; i++) {
		addr.addr = clique->members[info->list[info->nextHost]].address;
		tmp = IPAddressMachine_r(addr);
		if (tmp != NULL) {
			/* everything is cool */
			break;
		}
		/* let's look try to resolve the right way the names and
		 * repeat */
		if (!ResolveCliqueMembers(clique->name)) {
			/* we couldn't find anything */
			break;
		}
	}

	/* if we are desperate we resolve to IP */
	if (tmp == NULL) {
		WARN("RunCliqueExperiment: cannot resolve to name: trying IP address!\n");
		tmp = IPAddressImage_r(addr);
		if (tmp == NULL) {
			ERROR("RunCliqueExperiment: cannot resolve address.\n");
			FREE(options);
			return 0;
		}
	}

	/* allocate space for "\ttarget:name:port": 10 fixed characters
	 * and the NUL plus the digits of a long port value fit in 32 */
	grown = (char *)REALLOC(options, strlen(options) + strlen(tmp) + 32);
	if (grown == NULL) {
		ERROR("RunCliqueExperiment: out of memory\n");
		FREE(options);
		FREE(tmp);
		return 0;
	}
	options = grown;
	if (options[0] != '\0') {
		strcat(options, "\t");
	}
	sprintf(options + strlen(options), "target:%s:%ld", tmp, clique->members[info->list[info->nextHost]].port);
	FREE(tmp);

	/* and now add the fork options if not specified */
	tmp = GetOptionValue(options, "fork", NULL);
	if (tmp == NULL) {
		/* "\tfork:yes" and the NUL are exactly 10 characters */
		grown = (char *)REALLOC(options, strlen(options) + 10);
		if (grown == NULL) {
			ERROR("RunCliqueExperiment: out of memory\n");
			FREE(options);
			return 0;
		}
		options = grown;
		sprintf(options + strlen(options), "\tfork:%s", (doFork) ? "yes" : "no");
	}
	FREE(tmp);

	/* let's fork if we've been asked to */
	if (doFork) {
		if(!CreateLocalChild(&info->delegatePid, NULL, &childToParent)) {
			FREE(options);
			FAIL1("RunCliqueExperiment: fork for clique %s failed\n", clique->name);
		}

		if(info->delegatePid > 0) {
			/* Parent process. */
			FREE(options);
			return 1;
		}
		/* Child process. */
		/* now add the socket for the child to parent communication:
		 * "\tchildToParent:" is 15 characters, an int fits in the
		 * rest */
		grown = (char *)REALLOC(options, strlen(options) + 32);
		if (grown == NULL) {
			ERROR("RunCliqueExperiment: out of memory\n");
			/* we are the child: returning would make us carry
			 * on as a second copy of the sensor */
			DROP_SOCKET(&childToParent);
			exit(0);
		}
		options = grown;
		sprintf(options + strlen(options), "\tchildToParent:%d", childToParent);
 	 }


	timeOut = (unsigned int)(clique->period) - 1;
	UseSkill(info->skill, options, timeOut, &results, &resultsLength);
	FREE(options);
	toStore.timeStamp = CurrentTime();

	ret = 0;
	for(i = 0; i < resultsLength; i++) {
		if(results[i].succeeded) {
			toStore.measurement = results[i].measurement;
			toRegister = CreateSeriesObject(NULL, 
					EstablishedRegistration(),
					ResourceLabel(results[i].resource),
					"noname",
					results[i].options,
					ResourceName(results[i].resource),
					clique->name);

			if (doFork) {
				/* let's tell the parent to store the
				 * experiment: the parent knows which
				 * memory is active right now, so it will
				 * do the right thing */
				d[0] = toStore.timeStamp;
				d[1] = toStore.measurement;
				/* this is a trick: to keep the code
				 * simple we put the length of the series
				 * registration as 3 values here */
				d[2] = strlen(toRegister);

				/* Send the registration request
				 * back to the parent process. */
				descriptor.repetitions = (int)d[2];
				if (!SendMessageAndDatas(childToParent,
						  CLIQUE_EXPERIMENT,
						  d,
						  &measDescriptor,
						  1,
						  toRegister,
						  &descriptor,
						  1,
						  0)) {
					ERROR("RunCliqueExperiment: failed to talk to parent\n");
				}
			} else {
				RegisterExperiment(toRegister, &toStore);
			}
			FreeObject(&toRegister);

			/* save the result for the exit status */
			ret = 1;
		} else {
			WARN1("RunCliqueExperiment: %s failed\n",results[i].options);
		}
	}
	FreeSkillResults(resultsLength, &results);

	if (doFork) {
		/* Send reactivate message. */
		(void)SendClique(childToParent, CLIQUE_ACTIVATE, clique, -1);
		DROP_SOCKET(&childToParent);
		exit(ret);
	}

	return ret;
}


/*
 * Performs one experiment indicated by #info#, or passes the clique
 * token on if all members in the clique have already been contacted.
 * Updates #info# to reflect the new token status.
 *
 * What happens depends on #info->state#:
 *   TOKEN_PASSED - we got here because of a timeout: either we adopt the
 *                  token (the longer timeout expired), or we get ready for
 *                  a new round (nextHost reached the last slot), or we keep
 *                  running experiments without the token;
 *   TOKEN_HELD   - the leader refreshes the clique timeout at the beginning
 *                  of a cycle; when nextHost reaches the last slot the
 *                  token is forwarded, otherwise an experiment is run;
 *   any other state is logged and ignored.
 * At most one experiment is run per call (through RunCliqueExperiment()),
 * after which #info->nextHost# is advanced.  A NULL #info# is logged and
 * ignored.
 */
static void
RunCliqueJob(TokenInfo *info) {
	Clique *clique;
	double now, tmp;
	int indx;
	NWSAPI_Measurement m;
	NWSAPI_ForecastCollection forecast;

	/* sanity check */
	if (info == NULL) {
		WARN("RunCliqueJob: NULL parameter\n");
		return;
	}
	now = CurrentTime();
	clique = &info->clique;

	/* we calculate here the distance from leader, going around the
	 * member table.  Note that the leader itself ends up with a
	 * distance of clique->count (not 0). */
	if (info->myIndex > clique->leader) {
		indx = info->myIndex - clique->leader;
	} else {
		indx = (clique->count + info->myIndex) - clique->leader;
	}

	switch (info->state) {
	case TOKEN_PASSED:
		/* we got here because of a timeout: we need to check if
		 * the simple timeout, then we just take measurement on
		 * our own. If the bigger timeout popped we need to
		 * regenerate the token (we set this one pretty big so
		 * that we don't overflow the network with token). */

		/* is time to be leader? We calculate this timeout on the
		 * basis of time + clique timeout + the distance from the
		 * leader * timeout/clique.count. We are trying to give
		 * somebody else (the one after the leader) to send a
		 * token successfully. */
		tmp = indx * ((double)clique->timeOut/(double)clique->count);
		if (info->cycleStartTime + clique->timeOut + tmp < now) {
			/* the bigger timeout popped: we need to take
			 * leadership and move on */
			AdoptToken(info);
			WARN1("RunCliqueJob: %s timed out\n", clique->name);

			return;
		} else if (info->nextHost == (clique->count - 1)) {
			/* we are doing experiments on our own and we are
			 * done with one cycle: let's get ready for the
			 * next round, one period from now */
			CreateNewHostList(info);
			info->nextRunTime = now + clique->period;
			return;
		} else {
			/* well, we are in the middle of doing
			 * experiments and the timeout popped, so we
			 * might need to recover a child */
			RecoverChild(info);
		}

		/* if we are here, we are doing experiments on our own
		 * without token */
		LOG1("RunCliqueJob: doing experiments for clique %s without token\n", clique->name);
		break;

	case TOKEN_HELD:
		/* we need to distinguish if we are here at the beginning
		 * of the cycle.  Only the leader does update the timeout
		 * and expected cycle time. */
		if (info->nextHost == 0 && info->cycleStartTime != 0
				&& info->myIndex == clique->leader) {
			/* feed the duration of the cycle just completed to
			 * the forecaster and derive the new timeout as
			 * forecast + 2 * error */
			m.timeStamp = now;
			m.measurement = now - info->cycleStartTime;
			NWSAPI_UpdateForecast(info->expectedCycleTime, &m, 1);
			NWSAPI_ComputeForecast(info->expectedCycleTime, &forecast);
			clique->timeOut = forecast.forecasts[NWSAPI_MSE_FORECAST].forecast + 2 * forecast.forecasts[NWSAPI_MSE_FORECAST].error;

			/* just be sure the timeout is still a
			 * reasonably long period: keep it between 3 and
			 * 10 periods.  */
			if (clique->timeOut < clique->period * 3) {
				clique->timeOut = clique->period * 3;
			} else if (clique->timeOut > clique->period * 10) {
				clique->timeOut = clique->period * 10;
			}
			DDEBUG3("RunCliqueJob: clique %s has an expected cycle of %.0f (timeout %.0f)\n", clique->name, forecast.forecasts[NWSAPI_MSE_FORECAST].forecast, clique->timeOut);
			
			/* The leader needs to wait till it's
			 * time for the token circulation again. */
			if (info->cycleStartTime + clique->period > now) {
				info->nextRunTime = info->cycleStartTime + clique->period;
				/* since I've already done the
				 * bookkeeping, let's skip it next
				 * time (cycleStartTime == 0 disables the
				 * test above) */
				info->cycleStartTime = 0;
				return;
			}
		}

		/* let's check if we are at the beginning of the cycle, or
		 * if some child needs be recovered */
		if (info->nextHost == 0) {
			/* beginning of cycle and everyone marks the
			 * start time for this cycle */
			info->cycleStartTime = now;
		} else if (RecoverChild(info)) {
			/* let's force a send token: we passed our time
			 * slice for doing experiments */
			info->nextHost = clique->count - 1;
			INFO1("RunCliqueJob: recovered child, skipping remaining experiments for clique %s\n", clique->name);
		}

		/* end of the cycle: we need to send the token */
		if (info->nextHost == (clique->count - 1)) {
			/* time to send the token: we don't care about
			 * the Send results, since we have a timeout to
			 * catch problems. Send can be done
			 * asynchronously as a best effort. */
			SendToken(CLIQUE_TOKEN_FWD, clique, info->myIndex, 0);
			info->state = TOKEN_PASSED;

			/* let's get ready for the next run */
			CreateNewHostList(info);

			/* and let's set a timeout: if this timeout pops
			 * we'll do experiments without regenerating the
			 * token. See TOKEN_PASSED for the timeout to
			 * regenerate the token.  Trying to minimize the
			 * overlap with other fellow members, I take this
			 * timeout as now + period + (the distance from
			 * the leader * period / clique.count). The
			 * rationale is that when I
			 * did my exp at first, the previous member
			 * already had enough time to do experiment, so I
			 * grant a bit more (up until the next period is
			 * up for measurement). */
			tmp = (indx * clique->period)/clique->count;
			info->nextRunTime = now + clique->period + tmp;

			/* no need for experiments now */
			return;
		}

		/* now it is time to do measurements. */
		break;

	default:
		WARN1("RunCliqueJob: wrong token for clique %s\n", clique->name);
		return;
	}

	/* we need to do experiments: we do them one at a time, because
	 * we want the results at each step. */

	/* now let's see if this clique is too early */
	if (info->nextHost == 0) {
		/* let's record the time of this measurement: we record
		 * only the first host we try. We allow a slack of 1 (in
		 * the units returned by CurrentTime()) so that a round
		 * starting marginally before a full period has elapsed
		 * is not rejected. */
		if (info->lastMeasurement + clique->period > now + 1){
			WARN1("RunCliqueJob %s is early\n", clique->name);

			/* let's punt to the right time */
			info->nextRunTime = info->lastMeasurement + clique->period + 1;
			return ;
		}
		info->lastMeasurement = now;
	}

	/* finally run the experiment and update nextHost */
	RunCliqueExperiment(info);
	info->nextHost++;

	/* let's set a default timeout, only if we are forking, to
	 * recover a child.  When not forking nextRunTime is left
	 * untouched here. */
	if (doFork) {
		info->nextRunTime = info->lastMeasurement + clique->period;
	}

	return;
}



/*
 * Notifies the clique module that the child process #pid# is gone: every
 * known clique whose token records #pid# as its delegate (#delegatePid#)
 * is handed to ActivateToken().  The clique table is locked for the whole
 * scan.
 */
void
CliqueChildDeath(int pid) {
	int i;

	GetNWSLock(&lock);
	/* scan the whole table: slot 0 is a regular clique slot like any
	 * other (see CliqueWork() and NextCliqueWork()), so it must not be
	 * skipped or its token would never be reactivated from here */
	for(i = 0; i < cliquesKnown; i++) {
		if(cliqueTokens[i].delegatePid == pid) {
			ActivateToken(&cliqueTokens[i]);
		}
	}
	ReleaseNWSLock(&lock);
}


int
CliqueInit(const char *options) {
	static int initialized = 0;
	int i;
	KnownSkills skill;
	Object toRegister;
	char *tmp, opts[EXP_LIST_SIZE], skills[EXP_LIST_SIZE];

	/* initialize only once */
	GetNWSLock(&lock);
	if (initialized) {
		ReleaseNWSLock(&lock);
		ERROR("CliqueInit: you can initialize only once\n");
		return 0;
	}
	initialized = 1;
	ReleaseNWSLock(&lock);

	/* let's find out if we are forking */
	tmp = GetOptionValue(options, "fork", NULL);
	if (tmp == NULL || strncmp(tmp, "yes", (strlen(tmp) > 3 ? 3 : strlen(tmp))) == 0) {
		doFork = 1;
	} else {
		doFork = 0;
	}
	FREE(tmp);

	/* Register listeners for all clique messages. */
	RegisterListener(CLIQUE_ACTIVATE, "CLIQUE_ACTIVATE", HandleCliqueMessage);
	RegisterListener(CLIQUE_DIE, "CLIQUE_DIE", HandleCliqueMessage);
	RegisterListener(CLIQUE_TOKEN_FWD, "CLIQUE_TOKEN_FWD", HandleCliqueMessage);
	RegisterListener(CLIQUE_EXPERIMENT, "CLIQUE_EXPERIMENT", HandleCliqueMessage);
	RegisterListener(CLIQUE_NEW_MEMBER, "CLIQUE_NEW_MEMBER", HandleCliqueMessage);
	RegisterListener(CLIQUE_REMOVE_MEMBER, "CLIQUE_REMOVE_MEMBER", HandleCliqueMessage);

	/* Register the clique control. */
	skills[0] = '\0';
	for(i = 0; i < SKILL_COUNT; i++) {
		skill = (KnownSkills) i;
		if (SkillAvailableForControl(skill, options, CLIQUE_CONTROL)) {
			if (skills[0] != '\0') {
				strcat(skills, "\t");
			}
			strcat(skills, SkillName(skill));
		}
	}
	sprintf(opts, "member:2_to_%d_sensor\tperiod:0_to_1_int", MAX_MEMBERS);
	toRegister = CreateControlObject(NULL, 
			CLIQUE_CONTROL_NAME,
			EstablishedRegistration(),
			opts,
			skills);
	RegisterObject(toRegister);
	FreeObject(&toRegister);

	return 1;
}


void
CliqueWork(void) {
	int i;

	/* this is very brutal: we need to lock it all! less of copying
	 * the current TokenInfo, we have to be sure there won't be
	 * changes on cliqueTokens */
	GetNWSLock(&lock);
	for(i = 0; i < cliquesKnown; i++) {
		if (cliqueTokens[i].state != TOKEN_STOPPED) {
			if(CurrentTime() >= cliqueTokens[i].nextRunTime) {
				(void)RunCliqueJob(&cliqueTokens[i]);
			}
		}
	}
	ReleaseNWSLock(&lock);
}


double
NextCliqueWork(void) {
	int i;
	double next = 0.0;

	GetNWSLock(&lock);
	for(i = 0; i < cliquesKnown; i++) {
		if (cliqueTokens[i].state != TOKEN_STOPPED 
				&& ((next == 0.0) || (cliqueTokens[i].nextRunTime <= next))) {
			next = cliqueTokens[i].nextRunTime;
		}
	}
	ReleaseNWSLock(&lock);

	return next;
}


/*
 * Tells whether #name# is a clique known to this module: returns the
 * result of FindTokenInfo() for #name#; the token index it reports is
 * discarded.
 */
int
RecognizedCliqueActivity(const char *name) {
	unsigned int unusedIndex;
	int known;

	known = FindTokenInfo(name, &unusedIndex);
	return known;
}


/*
 * Starts the clique activity registered as #registration#, which exercises
 * the skill #skillName#.  #options# supplies the "period" (defaults to
 * DEFAULT_CLIQUE_PERIOD) and the comma separated "member" list, along with
 * the skill specific options.  The clique built from these is added to the
 * known cliques (or replaces a stopped one with the same name) and
 * registered.
 *
 * Returns 1 on success.  Returns 0 when a parameter is NULL, the skill is
 * not supported, fewer than 2 or more than MAX_MEMBERS members could be
 * resolved, this host is not part of the clique, or the clique is already
 * running.
 */
int
StartCliqueActivity(const char *registration,
                    const char *skillName,
                    const char *options) {
	Clique clique;
	struct host_cookie host;
	IPAddress hostAddress;
	char hostName[MAX_HOST_NAME];
	int i;
	TokenInfo *info;
	const char *member;
	char *tmp;
	KnownSkills skill;
	unsigned int tokenIndex;

	/* sanity check */
	if (registration == NULL || skillName == NULL) {
		ERROR("StartCliqueActivity: NULL parameter\n");
		return 0;
	}

	/* Figure out what skill we're being asked to exercise. */
	for (i = 0; i < SKILL_COUNT; i++) {
		skill = (KnownSkills) i;
		if (!SkillAvailableForControl(skill, options, CLIQUE_CONTROL)) {
			continue;
		}
		if(strcmp(skillName, SkillName(skill)) == 0) {
			break;
		}
	}
	if (i == SKILL_COUNT) {
		FAIL1("StartCliqueActivity: unsupported skill %s\n", skillName);
	}

	/* let's fill in all the clique details */
	if (strlen(registration) > MAX_CLIQUE_NAME_SIZE) {
		WARN("StartCliqueActivity: clique name too long!\n");
	}
	SAFESTRCPY(clique.name, registration);
	clique.whenGenerated = CurrentTime();
	clique.instance = 1;
	SAFESTRCPY(clique.skill, skillName);
	SkillOptions(skill, options, clique.options, sizeof(clique.options));
	tmp = GetOptionValue(options, "period", DEFAULT_CLIQUE_PERIOD);
	clique.period = strtod(tmp, NULL);
	FREE(tmp);
	clique.timeOut = 3 * clique.period;
	/* the leader defaults to member 0; it is switched to ourselves
	 * below, when we find our own address in the roster */
	clique.leader = 0;
	/* Convert the member roster from strings to address/port pairs. */
	clique.count = 0;
	tmp = GetOptionValue(options, "member", "");
	for(member = tmp; GETTOK(hostName, member, ",", &member);) {
		Host2Cookie(hostName, DefaultHostPort(SENSOR_HOST), &host);
		if(IPAddressValue(host.name, &hostAddress)) {
			/* never write past the member table: a roster longer
			 * than what the clique control advertises is refused.
			 * NOTE(review): assumes clique.members holds
			 * MAX_MEMBERS entries -- confirm against cliques.h */
			if (clique.count >= MAX_MEMBERS) {
				FREE(tmp);
				ERROR1("StartCliqueActivity: too many members for %s\n", clique.name);
				return 0;
			}
			/* if it's me, I'll use my best IP (perhaps given
			 * to me with -a command line option) */
			if (EstablishedInterface(hostAddress, host.port)) {
				/* it's me: get the preferred one */
				hostAddress = PreferredInterface();
				/* since I'm starting I assume leadership */
				clique.leader = clique.count;
			}
			clique.members[clique.count].address = hostAddress.addr;
			clique.members[clique.count].port = host.port;
			clique.count++;
		} else {
			WARN1("StartCliqueActivity: conversion of %s failed\n", hostName);
		}
	}
	FREE(tmp);

	/* let's check that we have at least 2 members here */
	if (clique.count < 2) {
		FAIL1("StartCliqueActivity: not enough members for %s\n", clique.name);
	}

	/* now let's try to add this clique to the known cliques: -1 means
	 * we are not in it, 0 is handled below as an already known clique */
	i = AddNewClique(&clique, &tokenIndex);
	if (i == -1) {
		INFO1("StartCliqueActivity: not in clique %s\n", clique.name);
		return 0;
	}

	GetNWSLock(&lock);
	info = &cliqueTokens[tokenIndex];

	if (i == 0) {
		/* if the clique is old and it's not stopped (or I was
		 * not a member) then complain */
		if (info->state != TOKEN_STOPPED) {
			ReleaseNWSLock(&lock);
			ERROR1("StartCliqueActivity: %s is already running\n", registration);
			return 0;
		}

		/* update the clique */
		info->clique = clique;
	}

	/* let's be sure we have it registered */
	RegisterClique(info);

	/* RunCliqueJob will start the clique right away */
	ReleaseNWSLock(&lock);

	return 1;
}


/*
 * Stops the clique registered as #reg#: its token is marked TOKEN_STOPPED,
 * a CLIQUE_DIE is sent (at most once per clique period) and the
 * registration is removed.  An empty #reg# stops and unregisters every
 * known clique, without taking the lock and without sending CLIQUE_DIE.
 *
 * Returns 1 on success, 0 when #reg# is NULL or names an unknown clique.
 */
int
StopCliqueActivity(const char *reg) {
	unsigned int tokenIndex;
	double now;
	TokenInfo *info;

	/* sanity check */
	if (reg == NULL) {
		ERROR("StopCliqueActivity: NULL parameter\n");
		return 0;
	}

	if(reg[0] == '\0') {
		/* we are probably shutting down: don't lock! */
		for(tokenIndex = 0; tokenIndex < cliquesKnown; tokenIndex++) {
			INFO1("StopCliqueActivity: stopping clique %s\n", cliqueTokens[tokenIndex].clique.name);
			cliqueTokens[tokenIndex].state = TOKEN_STOPPED;
			UnregisterObject(cliqueTokens[tokenIndex].clique.name);
		}

		return 1;
	}

	if (!FindTokenInfo(reg, &tokenIndex)) {
		FAIL1("StopCliqueActivity: unknown clique %s\n", reg);
	}

	/* kinda sucks: we need to lock across a send */
	now = CurrentTime();
	GetNWSLock(&lock);
	info = &cliqueTokens[tokenIndex];	
	INFO1("StopCliqueActivity: stopping clique %s\n", info->clique.name);
	info->state = TOKEN_STOPPED;

	/* we send a CLIQUE_DIE only once per clique period: once the clique
	 * is stopped cycleStartTime is reused to remember when the next
	 * CLIQUE_DIE may be sent */
	if (info->cycleStartTime < now) {
		DDEBUG1("StopCliqueActivity: sending CLIQUE_DIE for %s\n", reg);
		SendToken(CLIQUE_DIE, &info->clique, info->myIndex, 1);
		info->cycleStartTime = now + info->clique.period;
	}
	ReleaseNWSLock(&lock);
	UnregisterObject(reg);

	return 1;
}
