/*
  Copyright Mission Critical Linux, 2000

  Kimberlite is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  Kimberlite is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Kimberlite; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
  MA 02139, USA.
*/
/*
 *  $Id: disklock.c,v 1.7 2000/09/13 19:39:03 burke Exp $
 *
 *  Copyright (C) 2000 Mission Critical Linux, LLC
 *
 *  author: Tim Burke <burke@missioncriticallinux.com>
 *  description: Low level locking primitives.
 *
 * disklock.c
 *
 * This file implements the low-level disk access routines used 
 * to implement the locking mechanism which is used to synchronize
 * access to service descriptions among cluster members.
 */
#include <stdio.h>
#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <signal.h>
#include <errno.h>
#include <sys/mman.h>
#include <logger.h>
#include <sys/syslog.h>
#include <clucfg.h>

#include "diskstate.h"
#include "disk_proto.h"

/* RCS revision keyword string; tagged unused so the compiler does not warn. */
static const char *version __attribute__ ((unused)) = "$Revision: 1.7 $";

/*
 * Forward routine declarations.
 */
static int writeLockBlock(int nodeNum, DiskLockBlock *lkblk);
static int readLockBlock(int nodeNum, DiskLockBlock *lkblk);

/* Nonzero once initLockSubsys() has succeeded; reset by closeLockSubsys(). */
static int lockSubsysInitialized = 0;
/* Local node ID as returned by cluGetLocalNodeId(); NODE_ID_NONE until set. */
static int myNodeNum = NODE_ID_NONE;
/* The other member's node ID, derived from myNodeNum in initLockSubsys(). */
static int partnerNodeNum = NODE_ID_NONE;

/*
 * .............Configurable Parameters...................
 *
 * The following tuning knobs are intended to allow customization.
 */
/*
 * We tolerate a few IO errors before reacting.
 * This parameter defines how many consecutive errors are needed
 * to declare a true IO failure condition.  It is intended to avoid
 * over-reacting to an intermittent error.
 *
 * lockRead() and lockWrite() use it as the number of attempts they
 * make before giving up and calling consider_shutdown().
 */
static int max_consecutive_io_errors = MAX_CONSECUTIVE_IO_ERRORS;

/*
 * Called to initialize subsystem state variables.
 * Also opens the file descriptor representing the shared state partition.
 * Note: this can be called multiple times in the course of a running system.
 * Therefore it should not directly modify the on-disk lock state.
 * 
 * Returns: 0 on success.
 */
int initLockSubsys() {
    int nodeNum;

    if (lockSubsysInitialized) {
	return(0);
    }
    initSharedFD();
    nodeNum = cluGetLocalNodeId();
    if (nodeNum < 0) {
	clulog(LOG_ERR, "initLockSubsys: unable to determine my node ID.\n");
	return(-1);
    }

    // Kludge to get partner's node number
    myNodeNum = nodeNum;
    if (myNodeNum == 0) partnerNodeNum = 1;
    else partnerNodeNum = 0;

    lockSubsysInitialized = 1;
    return(0);
}

/*
 * Counterpart of initLockSubsys().
 *
 * The only teardown performed here is marking the subsystem as
 * uninitialized, so that the next user re-runs initLockSubsys().
 * This may run many times over the life of a system, so it never
 * writes the on-disk lock blocks.
 *
 * Returns: always 0.
 */
int closeLockSubsys(void) {
    if (lockSubsysInitialized != 0) {
        lockSubsysInitialized = 0;
    }
    return 0;
}

/*
 * Reset an in-memory lock block to its pristine state.
 *
 * Every byte of *lkblk is zeroed, then the magic number and version are
 * stamped and the lock is marked free.  Nothing is written to disk;
 * the remaining fields are filled in later during normal operation.
 */
void initLockBlock(DiskLockBlock *lkblk) {
    memset(lkblk, 0, sizeof *lkblk);

    lkblk->lockData = DISK_LOCK_FREE;
    lkblk->version = LOCK_BLOCK_LATEST_VERSION;
    lkblk->magic_number = LOCK_BLOCK_MAGIC_NUMBER;
}

/*
 * Write one node's lock block out to the shared partition.
 *
 * NOTE: Raw IO operations are required to be 512 byte aligned and of
 * lengths which are a multiple of 512 bytes in size.  Failure to do
 * so would require a "bounceio" operation involving allocating a temporary
 * buffer and copying.  To avoid this, we assume that the caller is passing
 * a *lkblk which meets these requirements!
 *
 * Params: nodeNum - which node's block to write; must be 0 or 1.
 *         lkblk   - block to write; its magic number must already be set.
 *
 * Returns: -1 if nodeNum is invalid, the subsystem cannot be initialized,
 *          or the block's magic number is wrong; otherwise the return
 *          value of diskRawWriteShadow() (callers treat 0 as success).
 */

static int writeLockBlock(int nodeNum, DiskLockBlock *lkblk) {
    off_t offsetLock;

    /* Paranoia checks */
    if ((nodeNum != 0) && (nodeNum != 1)) {
        clulog(LOG_ERR, "writeLockBlock: Invalid node number %d.\n",
               nodeNum);
        return(-1);
    }
    /* Lazily initialize so callers need not do it explicitly. */
    if (lockSubsysInitialized == 0) {
        if (initLockSubsys() != 0) {
            clulog(LOG_ERR, "writeLockBlock: Subsystem init failure.\n");
            return(-1);
        }
    }
    /* Refuse to put a block on disk that a reader would reject. */
    if (lkblk->magic_number != LOCK_BLOCK_MAGIC_NUMBER) {
        clulog(LOG_ERR, "writeLockBlock: invalid magic# 0x%lx\n",
               lkblk->magic_number);
        return(-1);
    }

    /* Each node's block sits at a fixed stride from the first block. */
    offsetLock = (OFFSET_FIRST_LOCK_BLOCK +
                  (nodeNum * SPACE_PER_LOCK_BLOCK));

    /* The last argument tells the IO layer where the checksum field lives. */
    return diskRawWriteShadow(offsetLock, (char *)lkblk, sizeof(DiskLockBlock),
                              (ulong)offsetof(DiskLockBlock, check_sum));
}

/*
 * Read one node's lock block from the shared partition into *lkblk.
 *
 * NOTE: Raw IO operations are required to be 512 byte aligned and of
 * lengths which are a multiple of 512 bytes in size.  Failure to do
 * so would require a "bounceio" operation involving allocating a temporary
 * buffer and copying.  To avoid this, we assume that the caller is passing
 * a *lkblk which meets these requirements!
 *
 * Params: nodeNum - which node's block to read; must be 0 or 1.
 *         lkblk   - destination buffer for the block.
 *
 * Returns: 0 on success.  Nonzero on error: -1 if nodeNum is invalid, the
 *          subsystem cannot be initialized, or the block read back has a
 *          bad magic number; otherwise the nonzero value returned by
 *          diskRawReadShadow().
 */
static int readLockBlock(int nodeNum, DiskLockBlock *lkblk) {
    off_t offsetLock;
    int ret;

    if ((nodeNum != 0) && (nodeNum != 1)) {
        clulog(LOG_ERR, "readLockBlock: Invalid node number %d.\n",
               nodeNum);
        return(-1);
    }
    /* Lazily initialize so callers need not do it explicitly. */
    if (lockSubsysInitialized == 0) {
        if (initLockSubsys() != 0) {
            clulog(LOG_ERR, "readLockBlock: Subsystem init failure.\n");
            return(-1);
        }
    }

    /* Each node's block sits at a fixed stride from the first block. */
    offsetLock = (OFFSET_FIRST_LOCK_BLOCK +
                  (nodeNum * SPACE_PER_LOCK_BLOCK));

    /*
     * The fourth argument tells the IO layer where the checksum field
     * lives.  NOTE(review): the trailing 0 is presumably a "do not
     * repair" flag - confirm against diskRawReadShadow().
     */
    ret = diskRawReadShadow(offsetLock, (char *)lkblk, sizeof(DiskLockBlock),
                            (ulong)offsetof(DiskLockBlock, check_sum), 0);

    if (ret) {
        clulog(LOG_ERR, "readLockBlock: bad ret %d from diskRawReadShadow\n", ret);
        return(ret);
    }
    /* A successful IO can still hand back a block that was never initialized. */
    if (lkblk->magic_number != LOCK_BLOCK_MAGIC_NUMBER) {
        clulog(LOG_ERR, "readLockBlock: Invalid magic # 0x%lx.\n",
               lkblk->magic_number);
        return(-1);
    }
    return(0);
}

/*
 * Dump a single lock block to the log at LOG_DEBUG level.
 *
 * Params: lkblk   - block whose magic number, version and lock data
 *                   are printed.
 *         nodeNum - node number shown in the banner line; it is only
 *                   used for labelling.
 */
void printLockBlock(DiskLockBlock *lkblk, int nodeNum) {

    clulog(LOG_DEBUG, "Lock Block, Node %d ------------\n", nodeNum);
    clulog(LOG_DEBUG, "magic# = 0x%lx\n", lkblk->magic_number);
    clulog(LOG_DEBUG, "version = %d\n", lkblk->version);
    clulog(LOG_DEBUG, "lockData = 0x%x ", lkblk->lockData);

    /* Follow the raw value with its symbolic meaning. */
    if ((lkblk->lockData != DISK_LOCK_FREE) &&
        (lkblk->lockData != DISK_LOCK_TAKEN)) {
        clulog(LOG_DEBUG, " ERROR - Unknown lock data value!\n");
    }
    else if (lkblk->lockData == DISK_LOCK_FREE) {
        clulog(LOG_DEBUG, " FREE\n");
    }
    else {
        clulog(LOG_DEBUG, " TAKEN\n");
    }

    clulog(LOG_DEBUG, "------------------------------\n");
}

/*
 * Debug routine that reads every node's lock block and optionally
 * prints it.  Per the original notes it is also called as part of the
 * read/repair facility, which is why a silent mode exists.
 *
 * Param: doPrint - nonzero will cause the actual lock values to be printed
 * in the log file, otherwise the lock structs are read silently.
 *
 * The routine stops at the first block that cannot be read.  In that
 * case the closing banner is not printed.
 */
void printBothLockBlocks(int doPrint) {
    int i;
    DiskLockBlock *lkblk;

    if (lockSubsysInitialized == 0) {
        if (initLockSubsys() != 0) {
            clulog(LOG_DEBUG, "printBothLockBlocks: Subsystem init failure.\n");
            return;
        }
    }

    /* Reads go through an aligned buffer, as readLockBlock() requires. */
    lkblk = (DiskLockBlock *)allocAlignedBuf();
    if (lkblk == MAP_FAILED) {
        clulog(LOG_DEBUG, "printBothLockBlocks: unable to allocate aligned buffer.\n");
        return;
    }

    if (doPrint) {
        clulog(LOG_DEBUG, "==================== Lock Blocks ============================\n");
    }
    for (i = 0; i < MAX_NODES; i++) {
        if (readLockBlock(i, lkblk) != 0) {
            clulog(LOG_DEBUG, "printBothLockBlocks: unable to read block %d.\n", i);
            goto out;
        }
        if (doPrint) {
            printLockBlock(lkblk, i);
        }
    }
    if (doPrint) {
        clulog(LOG_DEBUG, "==============================================================\n");
    }

out:
    /* Single release point for the aligned buffer. */
    freeAlignedBuf((char *)lkblk);
}

/*
 * Initialize the on-disk data structures representing lock data.
 * This will later be overwritten when the disk locking subsystem
 * is initialized.  Its main purpose is to wipe the disk to a clean slate.
 *
 * Typically this would be called outside normal operating context, so it
 * first checks to see if the subsystem has been initialized, and if not,
 * it opens it up.  When the subsystem was opened here, it is closed
 * again before returning, on the error paths as well as on success.
 *
 * Returns: 0 on success,
 *         -1 if the subsystem cannot be initialized or no aligned buffer
 *            can be allocated,
 *          otherwise the nonzero value returned by writeLockBlock().
 */
int initializePartitionLockBlocks() {
    DiskLockBlock *lkblk;
    int retval = 0;
    int i;
    int init_done_here = 0;

    if (lockSubsysInitialized == 0) {
        if (initLockSubsys() != 0) {
            clulog(LOG_DEBUG, "initializePartitionLockBlocks: unable to init disk lock subsystem.\n");
            return(-1);
        }
        /* Remember that we opened it, so we also close it. */
        init_done_here = 1;
    }

    lkblk = (DiskLockBlock *)allocAlignedBuf();
    if (lkblk == MAP_FAILED) {
        clulog(LOG_DEBUG, "initializePartitionLockBlocks: unable to allocate aligned buffer.\n");
        retval = -1;
        goto out_close;
    }

    /*
     * Just wiping out any prior settings.  The block is re-initialized
     * before every write so each node gets a pristine copy.
     */
    for (i = 0; i < MAX_NODES; i++) {
        initLockBlock(lkblk);
        retval = writeLockBlock(i, lkblk);
        if (retval != 0) {
            clulog(LOG_ERR, "initializePartitionLockBlocks: unable to initialize partition lock blocks.\n");
            break;
        }
    }
    freeAlignedBuf((char *)lkblk);

out_close:
    /* Undo our own init on every exit path, not just on success. */
    if (init_done_here) {
        if (closeLockSubsys() != 0) {
            clulog(LOG_ERR, "initializePartitionLockBlocks: unable to close lock subsys.\n");
        }
    }
    if (retval == 0) {
        clulog(LOG_DEBUG, "initializePartitionLockBlocks: successfully initialized %d lock blocks.\n", MAX_NODES);
    }
    return(retval);
}

/*
 * Externally accessible API used to read a node's on-disk lock block
 * into *lock_block.
 *
 * The read is attempted up to max_consecutive_io_errors times, and the
 * first successful attempt ends the call.
 *
 * Returns: 0 - success, *lock_block holds the block read from disk.
 *	   -1 - every attempt failed (invalid node number, subsystem not
 *		initialized, IO error, or bad magic number).
 * Side Effect: calls consider_shutdown() when every attempt fails.
 * Assumption: This code assumes that the caller is serializing all requests
 * to lockClear, lockRead, lockWrite.  Necessary as there are only 2 aligned
 * buffers for RAW IO.
 */
int lockRead(int nodeNum, DiskLockBlock *lock_block) {
    int attempt;

    for (attempt = 0; attempt < max_consecutive_io_errors; attempt++) {
        if (readLockBlock(nodeNum, lock_block) == 0) {
            return 0;
        }
    }

    /*
     * Inability to read from the shared state partition constitutes
     * unsafe operation.  Initiate a clean shutdown in the hopes that
     * some cleanup can be done before we inevitably get shot.
     */
    consider_shutdown("Cluster Instability: can't read lock block.");
    return(-1);
}

/*
 * Externally accessible API used to update the disk resident representation
 * of a node's lock state from *lock_block.
 *
 * The magic number and version fields of the caller's block are
 * overwritten with the current values before writing.  The write is
 * attempted up to max_consecutive_io_errors times, and the first
 * successful attempt ends the call.
 *
 * Returns: 0 - success
 *	   -1 - every attempt failed (invalid node number, subsystem not
 *		initialized, or IO error).
 * Side Effect: calls consider_shutdown() when every attempt fails.
 * Assumption: This code assumes that the caller is serializing all requests
 * to lockClear, lockRead, lockWrite.  Necessary as there are only 2 aligned
 * buffers for RAW IO.
 */
int lockWrite(int nodeNum, DiskLockBlock *lock_block) {
    int attempt;

    /* Stamp the header so writeLockBlock()'s magic check always passes. */
    lock_block->magic_number = LOCK_BLOCK_MAGIC_NUMBER;
    lock_block->version = LOCK_BLOCK_LATEST_VERSION;

    for (attempt = 0; attempt < max_consecutive_io_errors; attempt++) {
        if (writeLockBlock(nodeNum, lock_block) == 0) {
            return(0);
        }
    }

    /*
     * Inability to write to the shared state partition constitutes
     * unsafe operation.  Initiate a clean shutdown in the hopes that
     * some cleanup can be done before we inevitably get shot.
     */
    consider_shutdown("Cluster Instability: can't write lock block.");
    return(-1);
}

/*
 * Externally accessible API used to reset the on-disk representation of
 * a node's lock data.  This is called in 2 places:
 * 1) When a node is declared down, the surviving node will reset the failed
 *    node's lock data to indicate that it is free.  This is the only time
 *    you write to the other node's disk based lock structure.
 * 2) When a node starts up, it resets its own lock block state to indicate
 *    that it isn't holding any locks.
 *
 * The block written is a freshly initialized one (lock marked
 * DISK_LOCK_FREE) that also records the caller's return address and
 * this process's pid in the holder fields.
 *
 * Returns: 0 - success
 *	   -1 - invalid parameter, or lock subsystem not initialized.
 *	   -2 - unable to access the lock description on disk.
 * Side Effect: lockWrite() calls consider_shutdown() if the block cannot
 * be written.
 * Assumption: This code assumes that the caller is serializing all requests
 * to lockClear, lockRead, lockWrite.  Necessary as there are only 2 aligned
 * buffers for RAW IO.
 */

int lockClear(int nodeNum) {
    /*
     * NOTE(review): this block lives on the stack, while writeLockBlock()
     * documents a 512 byte alignment requirement for raw IO buffers.
     * Confirm that diskRawWriteShadow() copes with an unaligned buffer,
     * or switch to allocAlignedBuf() as the other routines here do.
     */
    DiskLockBlock lock_block;
    void *pc;

    /* Record who asked for the clear, for later debugging. */
    pc = __builtin_return_address(0);

    clulog(LOG_DEBUG, "lockClear: resetting lock data for node %d to FREE.\n",nodeNum);
    if ((nodeNum != 0) && (nodeNum != 1)) {
        clulog(LOG_ERR, "lockClear: Invalid node number %d.\n", nodeNum);
        return(-1);
    }
    if (lockSubsysInitialized == 0) {
        if (initLockSubsys() != 0) {
            clulog(LOG_ERR, "lockClear: Subsystem init failure.\n");
            return(-1);
        }
    }

    /* Zeroes the block and marks it DISK_LOCK_FREE via the shared helper. */
    initLockBlock(&lock_block);
    lock_block.holder_pc = (ulong)pc;
    lock_block.holder_pid = getpid();

    if (lockWrite(nodeNum, &lock_block) != 0) {
        clulog(LOG_EMERG, "lockClear: unable to write lock block %d.\n", nodeNum);
        return(-2);
    }
    return(0);
}


