# status_checker.tcl --
#
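#       Watchdog for mash_server: periodically pings the server with an
#       HTTP GET request, kills and restarts it when it stops responding,
#       and reports state changes by email.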
#
# Copyright (c) 1997-2002 The Regents of the University of California.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# A. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
# B. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
# C. Neither the names of the copyright holders nor the names of its
#    contributors may be used to endorse or promote products derived from this
#    software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# @(#) $Header: /usr/mash/src/repository/mash/mash-1/tcl/applications/pathfinder/status_checker.tcl,v 1.4 2002/02/03 04:22:06 lim Exp $


#
# State 1: (system successfully running)
#   while (successful ping) { sleep ping_freq seconds }
#   goto state 0
#
# State 0: (system not responding)
#   kill mash_server
#   restart mash_server
#   send email "mash_server crashed; trying to restart"
#   sleep ping_freq seconds
#   if (successful ping) {
#     send email "mash_server back up"
#     goto state 1
#   } else if (this was the first failed attempt) {
#     send email "restart failed, trying again"
#     stay in state 0
#   } else {
#     send email "restart failed"
#     goto state -1
#   }
#
# State -1: (restart failed)
#   while ( !(successful ping) ) { sleep ping_freq seconds }
#   send email "mash_server back up"
#   goto state 1
#

import MTrace
import Configuration

# Status_Checker monitors a mash_server process: it pings the server,
# kills and restarts it when a ping fails, and emails status messages.
# All configuration is read from the "mserver" preferences in init.
Class Status_Checker

#
# Constructor.  Loads the "mserver" preferences, resolves the sendmail
# and mash_server commands, and then enters the event loop.  Because
# event_loop loops forever, this method does not return.
#
# Exits the process with a non-zero status if either the sendmail or
# the mash_server binary cannot be located.
#
Status_Checker public init { } {

    $self instvar email_addr_ server_addr_ server_port_ \
            sendmail_command_ server_command_ output_dir_ \
            ping_freq_

    # Load the mserver preferences.
    set o [$self options]
    $o load_preferences "mserver"
    set server_addr_ [$self get_option server_addr]
    set server_port_ [$self get_option server_port]

    # Initialize the email address to which error messages will be sent.
    set email_addr_ [$self get_option email_addr]

    # Locate the sendmail binary using either the options file or the
    # "which" command and create the command to be used by the
    # send_msg method.
    set sendmail_command_ [$self get_option sendmail_command]
    if { $sendmail_command_ == "" } {
        if { [catch { set sendmail_command_ [exec which sendmail] }] } {
            # Report on stderr and fail with a non-zero status so that
            # callers (shell scripts, init systems) can detect the error.
            puts stderr "sendmail command not found; please specify path in\
                    prefs-mserver file using sendmail_command."
            exit 1
        }
    }
    # -t makes sendmail take the recipients from the message headers.
    append sendmail_command_ " -t"

    # Determine the path for the mash_server from either the options
    # file or by locating it using "which."
    set server_command_ [$self get_option server_command]
    if { $server_command_ == "" } {
        if { [catch { set server_command_ [exec which mash_server] }] } {
            puts stderr "mash_server command not found; please specify path\
                    in prefs-mserver file using server_command."
            exit 1
        }
    }

    # Find the server's output directory from the options file.
    set output_dir_ [$self get_option output_dir]

    # Initialize the ping frequency from the options file. If ping
    # frequency equals x, then there are x seconds between pings.
    # The default of 30 seconds is kept when the option is not set;
    # overwriting it with an empty value would break "exec sleep".
    set ping_freq_ 30
    set configured_freq [$self get_option ping_freq]
    if { $configured_freq != "" } {
        set ping_freq_ $configured_freq
    }

    $self event_loop
}


#
# Runs the watchdog state machine.  The loop has no exit condition, so
# this method never returns.
#
#   alive          - ping every ping_freq_ seconds until a ping fails.
#   not responding - kill and restart the server, send a notification,
#                    wait ping_freq_ seconds and ping again; one retry
#                    is made before giving up.
#   restart failed - ping every ping_freq_ seconds until the server
#                    answers again.
#
Status_Checker private event_loop { } {

    $self instvar ping_freq_

    mtrace trcNet "In Status_Checker::event_loop"

    # Symbolic names for the three states.
    set st_alive 1
    set st_down 0
    set st_failed -1

    # The checker starts out assuming the server is up.
    set current $st_alive

    # Becomes 1 once the first restart attempt of an outage has failed.
    set retried 0

    while { 1 } {

        # --- Server believed to be running: poll until a ping fails.
        if { $current == $st_alive } {
            mtrace trcNet "-> In 'alive' state"
            while { 1 } {
                if { ![$self ping] } {
                    break
                }
                exec sleep $ping_freq_
            }
            mtrace trcNet "-> Exiting 'alive' state"
            set current $st_down
            continue
        }

        # --- Restart attempts exhausted: wait for the server to return.
        if { $current == $st_failed } {
            mtrace trcNet "-> In 'restart failed' state"
            while { 1 } {
                if { [$self ping] } {
                    break
                }
                exec sleep $ping_freq_
            }
            $self send_msg "mash_server back up"
            set current $st_alive
            continue
        }

        # Any value other than the three known states is ignored.
        if { $current != $st_down } {
            continue
        }

        # --- Server not responding: kill it, restart it, and re-check.
        mtrace trcNet "-> In 'not responding' state"
        $self kill_server
        set archived_log [$self restart_server]
        $self send_msg "mash_server crashed; trying to restart\ndebug output file: $archived_log"

        # Give the new server process time to come up before pinging.
        exec sleep $ping_freq_

        if { [$self ping] } {
            set current $st_alive
            set retried 0
            $self send_msg "mash_server back up"
            continue
        }

        if { $retried == 0 } {
            # First failure: stay in this state for one more attempt.
            $self send_msg "restart failed, trying again"
            set retried 1
            continue
        }

        # Second failure: give up restarting and just wait.
        $self send_msg "restart failed"
        set retried 0
        set current $st_failed
    }
}


#
# This method returns true if the server responds to GET requests.
#
Status_Checker private ping { } {

    # Returns 1 when a TCP connection to server_addr_:server_port_ can
    # be opened and the first line read back after an HTTP GET request
    # is non-empty; returns 0 in every other case, including any I/O
    # error on the socket.

    $self instvar server_addr_ server_port_

    # Open a socket connection to the server.
    if { [catch { set sock [socket $server_addr_ $server_port_] }] } {
        mtrace trcNet "-> Open socket failed."
        return 0
    }

    # Send a GET request to the mash_server and read the first line of
    # the reply.  A write or read error (e.g. the peer resetting the
    # connection) must count as a failed ping rather than abort the
    # checker, so the whole exchange is guarded.
    # NOTE(review): the read is still blocking; a server that accepts
    # the connection but never answers will stall the checker here.
    set reply ""
    set io_failed [catch {
        puts -nonewline $sock "GET / HTTP/1.0\r\n\r\n"
        flush $sock
        set reply [gets $sock]
    } io_error]

    # Always release the socket, even after an I/O error.
    catch { close $sock }

    if { $io_failed } {
        mtrace trcNet "-> Socket I/O failed: $io_error"
        return 0
    }

    if { $reply == "" } {
        mtrace trcNet "-> Nothing read from socket."
        return 0
    }

    return 1
}


#
# Emails the text in msg to email_addr_ by piping a message to the
# sendmail command prepared in init.
#
#   msg - body text of the notification.
#
# A delivery failure is traced and otherwise ignored, so that a mail
# problem cannot stop the checker from monitoring the server.
#
Status_Checker private send_msg { msg } {

    $self instvar email_addr_ sendmail_command_

    mtrace trcNet "In Status_Checker::send_msg"

    # Create the email message to be sent.
    set from_text "From: mash_server checker\n"
    set to_text "To: $email_addr_\n"
    set date [clock format [clock seconds] -format {%a, %d %B %Y %H:%M (%Z)}]
    set date_text "Date: $date\n"
    set sub_text "Subject: \n"
    # A line holding a single "." marks the end of the message.
    set end_msg ".\n"

    set text ""
    append text $from_text $to_text $date_text $sub_text "\n$msg\n" $end_msg

    # Send the email message using the sendmail command.  The text is
    # written exactly as built: any extra leading character would
    # corrupt the "From:" header.
    if { [catch {
        set command_id [open "| $sendmail_command_" w]
        puts $command_id $text
        close $command_id
    } send_error] } {
        mtrace trcNet "-> sendmail failed: $send_error"
    }

    mtrace trcNet "-> Exiting send_msg"
}


#
# Kills every process that "ps" lists with "bin/mash_server" in its
# entry.  Does nothing when no such process is found.  A failing
# "kill" is traced and otherwise ignored.
#
Status_Checker private kill_server { } {

    mtrace trcNet "In method Status_Checker::kill_server"

    # grep exits non-zero when nothing matches, which exec reports as
    # an error; that is treated as "no server process".
    if { [catch { set process_lines [exec ps | grep bin/mash_server] }] } {
        mtrace trcNet "-> Server not running."
        return
    }

    set found 0
    foreach line [split $process_lines "\n"] {
        # The grep in the pipeline above can show up in the listing.
        if { [string first "grep bin/mash_server" $line] >= 0 } {
            continue
        }

        # ps may right-align the PID column, so leading blanks must be
        # skipped; a plain split would yield an empty first field.
        if { ![regexp {^[ \t]*([0-9]+)} $line whole pid] } {
            continue
        }

        set found 1
        mtrace trcNet "-> Killing server process $pid."
        # The process may already have exited; do not abort on failure.
        if { [catch { exec kill $pid } kill_error] } {
            mtrace trcNet "-> kill $pid failed: $kill_error"
        }
    }

    if { !$found } {
        mtrace trcNet "-> Server not running."
    }
}


#
# Starts a new mash_server in the background from inside output_dir_,
# with its stdout and stderr redirected to output.txt.
#
# An existing output.txt is first renamed to output<seconds>.txt so
# that the output of the previous run is preserved.
#
# Returns the name of that archived file, or an empty string when
# there was no output.txt to archive.
#
Status_Checker private restart_server { } {

    $self instvar server_addr_ server_command_ output_dir_

    mtrace trcNet "In method Status_Checker::restart_server"

    # All file names below are relative to the output directory.
    cd $output_dir_

    set live_log output.txt
    set archived_log ""

    # Keep the previous run's output under a timestamped name.
    if { [file exists $live_log] } {
        set stamp [clock seconds]
        set archived_log "output${stamp}.txt"
        exec mv $live_log $archived_log
    }

    # For now, just test on the same machine.
    mtrace trcNet "-> Restarting the server."
    exec $server_command_ >& $live_log &

    # Disabled variant that would start the server on server_addr_
    # through ssh instead of on the local machine:
#    append output_file $output_dir_ output.txt
#    set command "$server_command_ >& $output_file &"
#    exec ssh -n $server_addr_ $command

    return $archived_log
}


# Script entry point.
# Presumably enables the trcNet trace category used by the mtrace calls
# in this file -- TODO confirm against the MTrace implementation.
MTrace init { trcNet }
# Create the checker.  NOTE(review): init ends by calling event_loop,
# which loops forever, so this statement is not expected to return
# (assuming "new" invokes init -- confirm).
set checker [new Status_Checker]

