Download
[here].
#!/bin/sh
## Program Name: scancluster
##
## Runs arbitrary commands on cluster nodes and
## collects interesting lines from system log
## files across the nodes of a beowulf cluster
## and writes them to a timestamped output file.
##
## All commands are added to an internal database
## of commands, and are accessible via an index,
## and may be combined for complex reports.
##
## In most cases, this script must be run on the
## gateway node of the beowulf cluster.
##
## Note that nodes are assumed to be named
## using a simple basename and index schema.
##
## RUN THIS SCRIPT WITH THE -help OPTION FOR A
## USEFUL SUMMARY OF IT'S BEHAVIOR!
##
## In addition to this script, it is a good idea
## to run:
##
## recon -v bootschema.lam
##
## and
##
## lamboot -d bootschema.lam && lamclean
##
## to diagnose lam related non-system issues.
##
## By: Phil Ehrens for The LIGO Lab at Caltech 2004
##
## The next line tells sh to execute the script using tclshexe \
exec tclsh "$0" ${1+"$@"}
# declare all options required for this script and for embedded
# commands here. Comment the options in the 'usage' procedure
# below!
set ::option_defaults [ list \
-help 0 \
-basename node \
-nodes 1-400 \
-machines [ list ] \
-taillines 10 \
-asynchronous 0 \
-user $env(USER) \
-syslog /var/log/messages \
-logpat 'oops|error|panic' \
-runlog scancluster.log \
-commands [ list 1 2 ] \
-documentation "no comment given for next cmd" \
-useansicolors 1 \
]
## ********************************************************
##
## Name: usage
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc usage { args } {
set commands [ print_commands ]
puts stderr "
USAGE:
scancluster -option1 value -option2 value ...
Where available options are:
-basename node name base (default: node)
-nodes node indices, cslists and ranges
i.e. \"1,3,5,10-20\"
-machines a list of machine names which, if this
option is used, overrides the -basename
and -nodes options, causing the script
to iterate over the explicitly declared
list of machine names.
-taillines number of lines of log tail to report
(default: 10)
-asynchronous run tasks concurrently in background
logging final state in order that the
commands return.
\$node.tmp files will be created as the
tasks progress, and final log entries
will be made when completed.
this makes the progress of the job more
difficult to follow, but for things like
badblocks, which can run for a very long
time, it is useful.
(default: 0 - set to 1 for concurrency)
-user user to run as (defaults to USER)
-syslog name of system log file to query
(default: /var/log/messages)
-logpat interesting log line filter egrep regexp
(default: 'oops|error|panic'
-runlog base name of log file this program will
write to. A timestamp will be appended
as an extension.
(default: scancluster.log)
setting '-runlog stdout' will cause output
to be written to stdout.
-commands list of commands to run
(default: \"1 2\")
* Running the script like this, for example:
./scancluster -commands \"1 2 {ps -al}\" -documentation \"ps -al\"
Will add a new command and a description of the command
AND will execute scancluster running commands 1, 2 and
the new command.
-documentation documentation line for new command, if
any was given.
-help displays this message, but does so using
the values of any other arguments to
update internal information first, so that
values of options are displayed as they
would be used in the command summary.
This is useful for debugging.
-useansicolors adds some color to the output for smart
terminals.
(default: 0 - set to 1 to colorize!)
CURRENTLY AVAILABLE COMMANDS: (command '0' is prepended to all
other commands internally.)
$commands\n\n"
exit
}
## ********************************************************
## ********************************************************
##
## Name: numRange
##
## Description:
## Expand embedded numerical ranges, i.e. 12-24 in
## { 1 2 3 4 12-24 } in place to create lists of
## integers, i.e. { 12 13 14 15 16... }
## Tries to handle all sorts of badness.
##
## Parameters:
## numbers - a Tcl list of numbers and ranges.
##
## Usage:
## numRange "1 2 3 4 5-12 13..21, 23, 24"
## Comments:
##
proc numRange { numbers } {
set nums {}
;## try to handle comma delimited lists, yeccchh.
regsub -all {\,\s*} $numbers " " numbers
;## and accomodate that other way of doing things.
regsub -all {\.\.} $numbers "-" numbers
foreach elem $numbers {
set elem [ string trim $elem ]
if { [ regexp {^(\d+)-(\d+)$} $elem -> begin end ] } {
if { $end < $begin } {
return -code error "Bad range: '$begin-$end'"
}
for { set i $begin } { $i <= $end } { incr i } {
lappend nums $i
}
} else {
lappend nums $elem
}
}
return $nums
}
## ********************************************************
## ********************************************************
##
## Name: command_manager
##
## Description:
##
## Parameters:
##
## Usage:
## Running the script like this, for example:
##
## ./scancluster -commands "1 2 {ps -al}" -documentation "ps -al"
##
## Will add a new command and a description of the command
## AND will execute scancluster running commands 1, 2 and
## the new command.
##
## Comments:
# #
proc command_manager { newcmd { comment none } } {
if { [ catch {
set available [ list ]
set cmd_rx {set\s+::Command\((\d+)\)\s+}
set script $::argv0
set fid [ open $script r ]
set old [ read $fid [ file size $script ] ]
close $fid
set fid [ open $script w ]
set done 0
foreach line [ split $old "\n" ] {
if { [ regexp $cmd_rx $line -> n ] } {
set done $n
append available "$line\n"
} elseif { $done > 0 && ! [ string match #* $line ] } {
set newcmd "set ::Command([ incr done ]) \"$newcmd\""
eval $newcmd
append available "$newcmd\n"
puts $fid "# $comment"
puts $fid "$newcmd"
lappend ::_commands $done
set done 0
}
puts $fid $line
}
close $fid
} err ] } {
return -code error "command_manager: $err"
}
}
## ********************************************************
## ********************************************************
##
## Name: parseopts
##
## Description:
##
## Usage:
##
## Comments:
## An ambiguous option (one that matches more than one
## default) will throw an exception.
proc parseopts { inputs } {
set matches [ list ]
;## get inputs and massage them into a well-formed list
regsub -all -- {\s+} $inputs { } inputs
regexp {^\{(.+)\}$} $inputs -> inputs
;## and GO! (trimming loose spaces from values)
foreach [ list option default ] $::option_defaults {
set option [ string trim $option - ]
set matched 0
foreach { name value } $inputs {
set name [ string trim $name ]
regsub -- {^[-]+} $name {} name
set value [ string trim $value ]
if { [ string length $value ] == 0 && \
! [ string equal h $name ] } {
set err "parseopts: option \"$name\" is missing value!"
return -code error $err
}
if { [ unrecognised_option $name ] } {
set err "unrecognised option: '-$name'\n"
append err "run '$::argv0 -h' for help\n"
return -code error $err
}
if { [ regexp -nocase -- ^$name $option ] } {
;## next test fails if ambiguous item is last
;## option on command line...
if { [ lsearch -exact $matches $name ] > -1 } {
set err "parseopts: ambiguous option: \"$name\""
return -code error $err
}
lappend matches $name
set ::$option $value
set matched 1
break
}
}
if { $matched == 0 } {
set ::$option $default
}
}
}
## ********************************************************
## ********************************************************
##
## Name: unrecognised_option
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc unrecognised_option { useropt } {
if { [ catch {
set unmatched 1
foreach [ list option default ] $::option_defaults {
if { [ regexp -nocase -- ^-$useropt $option ] } {
set unmatched 0
break
}
}
} err ] } {
return -code error "unrecognised_option: $err"
}
return $unmatched
}
## ********************************************************
## ********************************************************
##
## Name: print_commands
##
## Description:
## Helper function for usage procedure.
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc print_commands { args } {
if { [ catch {
foreach index [ lsort [ array names ::Command ] ] {
set command $::Command($index)
append text "$index: $command\n "
}
} err ] } {
return -code error "print_commands: $err"
}
return $text
}
## ********************************************************
## ********************************************************
##
## ANSI Coloring stuff
##
set ansimap [ list \
normal 0 bold 1 \
light 2 blink 5 \
invert 7 \
black 30 red 31 \
green 32 yellow 33 \
blue 34 purple 35 \
cyan 36 white 37 \
Black 40 Red 41 \
Green 42 Yellow 43 \
Blue 44 Purple 45 \
Cyan 46 White 47 ]
proc + { args } {
if { [ string equal 1 $::useansicolors ] } {
set t 0
foreach i $args {
set ix [ lsearch -exact $::ansimap $i ]
if { $ix > -1 } {
lappend t [ lindex $::ansimap [ incr ix ] ]
}
}
return "\033\[[join $t {;}]m"
} else {
return [ list ]
}
}
## ********************************************************
## ********************************************************
##
## Name: shellpipe
##
## Description:
## Shell wrapper that redirects stderr to stdout.
##
## Returns as soon as data are available OR after a
## timeout, whichever comes first.
##
## Parameters:
##
## Usage:
##
## cmd must be listified
##
## subcommands passed to ssh MUST be enclosed in
## single quotes!!
##
## Comments:
## Can only return the FIRST error from the command
##
proc shellpipe { cmd node i } {
if { [ catch {
set cmd [ subst -nocommands $::Command(0) ]
set fid [ open "|$cmd 2>@stdout" w+ ]
fconfigure $fid -blocking off
fconfigure $fid -buffering line
fileevent $fid readable [ list handle $fid $node $i ]
handle $fid $node $i
} err ] } {
catch { close $fid }
set ::data($node,$i) "shellpipe: $err"
handle ERROR $node
}
return $fid
}
## ********************************************************
## ********************************************************
##
## Name: handle
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc handle { fid node i } {
if { [ catch {
if { [ string equal ERROR $fid ] } {
set data ERROR
} else {
set data [ read $fid ]
if { [ string length $data ] } {
append ::data($node,$i) $data
}
}
if { [ string equal ERROR $fid ] || [ eof $fid ] } {
catch { close $fid }
puts stderr \
"$node task $i: [ + bold purple ]done![ + normal ]"
report $node $data 1
} elseif { [ string length $data ] } {
report $node $data 0
}
} err ] } {
catch { close $fid }
report $node "handle: $err" 1
}
}
## ********************************************************
## ********************************************************
##
## Name: report
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc report { node data done } {
if { [ catch {
if { $done == 1 } {
if { [ info exists ::tempfilefid($node) ] } {
set fid $::tempfilefid($node)
close $fid
unset ::tempfilefid($node)
}
foreach name [ array names ::data $node,* ] {
set data $::data($name)
puts $::logfid $data
unset ::data($name)
}
file delete -force ${node}.tmp
if { [ llength [ array names ::data ] ] == 0 } {
# there was a race condition when hosts
# were unreachable.
after 1000 finish
}
} else {
if { ! [ info exists ::tempfilefid($node) ] } {
set fid [ open ${node}.tmp a+ ]
fconfigure $fid -buffering line
set ::tempfilefid($node) $fid
}
set fid $::tempfilefid($node)
puts $fid $data
flush $fid
}
} err ] } {
catch { close $fid }
return -code error "report: $err"
}
}
## ********************************************************
## ********************************************************
##
## Name: finish
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc finish { args } {
if { [ llength [ array names ::data ] ] == 0 } {
set ::DONE 1
}
}
## ********************************************************
## ********************************************************
##
## Name: concurrent
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc concurrent { node } {
set i -1
foreach [ list running msg ] \
[ sockdiagnostic $node ] { break }
if { $running } {
foreach cmd [ lsort -unique $::_commands ] {
incr i
set cmd [ set ::Command($cmd) ]
set ::data($node,$i) \
"[ + bold red ]$node *************** $node[ + normal ]\n"
append ::data($node,$i) \
"[ + bold blue ]COMMAND: $cmd[ + normal ]\n"
shellpipe $cmd $node $i
}
} else {
puts stderr \
"\n$node: [ + bold red ]ERROR![ + normal ]"
set ::data($node,$i) \
"[ + bold red ]$node *************** $node[ + normal ]\n"
append ::data($node,$i) \
"[ + bold blue ]SOCKET DIAGNOSTIC FAILURE: "
append ::data($node,$i) "[ + bold red ]$msg[ + normal ]\n"
report $node $msg 1
}
}
## ********************************************************
## ********************************************************
##
## Name: nonconcurrent
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc nonconcurrent { node } {
set fid $::logfid
puts $fid \
"[ + bold red ]$node *************** $node[ + normal ]"
foreach [ list running msg ] \
[ sockdiagnostic $node ] { break }
if { $running } {
foreach cmd [ lsort -unique $::_commands ] {
set cmd [ set ::Command($cmd) ]
puts $fid "[ + bold blue ]COMMAND: $cmd[ + normal ]\n"
catch { set data [ eval exec $::Command(0) ] } data
puts $fid "$data\n"
}
} else {
puts $fid \
"[ + bold blue ]SOCKET DIAGNOSTIC FAILURE: [ + bold red ]$msg[ + normal ]\n"
}
}
## ********************************************************
## ********************************************************
##
## Name: create_nodelist
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc create_nodelist { args } {
if { [ catch {
set nodelist [ list ]
if { [ llength $::machines ] > 0 } {
set nodelist $::machines
} else {
set nodes [ numRange $::nodes ]
foreach node $nodes {
lappend nodelist $::basename$node
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $nodelist
}
## ********************************************************
## ********************************************************
##
## Name: sockdiagnostic
##
## Description:
## Try to diagnose machines ability to function within the
## LDAS system. The ping times for this procedure are
## comparable to those of the UNIX ping command.
##
## Parameters:
##
## Usage:
##
## Comments:
## Uses the default ssh port to determine whether a machine
## is alive and able to talk.
proc sockdiagnostic { host { port 22 } } {
set running 1
set msg "ok"
set timeout 2000
;## parse hostname from foo@hostname:NNN
;## cannot recall why we need to do this!!
if { [ regexp {\@([^\:]+)} $host -> tmp ] } {
set host $tmp
}
;## calculate time periods equal to 1%, 10% and 100%
;## of the ping timeout panic limit.
set short [ expr { round($timeout / 100.0) } ]
set medium [ expr { $short * 9 } ]
set long [ expr { $short * 90 } ]
if { [ catch {
set sid [ socket -async $host $port ]
;## three stage ping response eval. once after 1% of
;## the panic limit, again after 10%, then at the limit.
after $short
if { [ catch { fconfigure $sid -peername } ] } {
after $medium
if { [ catch { fconfigure $sid -peername } ] } {
after $long
if { [ catch { fconfigure $sid -peername } ] } {
set running 0
set msg "$host is down or has no service"
append msg " on port $port"
} else {
set msg "$host ping more than ${short}ms and "
append msg "less than ${timeout}ms"
puts stderr $msg
}
}
}
if { $running == 1 } {
set err [ fconfigure $sid -error ]
if { [ string length $err ] } {
set running 0
set msg "error reported on ${host}:${port}: $err"
}
}
} err ] } {
set running 0
set msg "host $host unreachable"
}
catch { ::close $sid }
return [ list $running $msg ]
}
## ********************************************************
## ********************************************************
##
## Name: openlog
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##
proc openlog { args } {
# the output will get a unique filename with a timestamp
if { [ string equal stdout $::runlog ] } {
if { $::asynchronous == 1 } {
set err "I can't write to STDOUT in ASYNCHRONOUS MODE!!"
puts stderr "\n\n$err\n\n"
exit
}
set ::logfid stdout
} else {
set ::mylog $::runlog.[ clock seconds ]
puts stderr "output will be written to: '$::mylog'"
set ::logfid [ open $::mylog w ]
}
}
## ********************************************************
## ********************************************************
proc bgerror { msg } {
set trace {}
catch { set trace $::errorInfo }
if { [ string length $trace ] } {
set msg $trace
}
set time [ clock format [ clock seconds ] -format "%m/%d/%y-%r %Z" ]
puts stderr "$time $msg"
}
## ********************************************************
parseopts $argv
;## ********************************************************
;## ************** BEGIN COMMAND DATABASE ******************
;## ********************************************************
# These are the remotely executable commands available.
# Note that Command '0' is the rsh base command.
# An ssh base command might be:
# ssh -x -n -obatchmode=yes \$node \$cmd 2>@stdout
# -n can be -odontreadstdin=yes for clarity.
# or not...
set ::Command(0) "rsh -l $user \$node \$cmd"
# find critical system log file entries
set ::Command(1) "egrep -i $logpat $syslog"
# report the last N log file entries unfiltered
set ::Command(2) "tail -n $taillines $syslog"
# find all lamd processes and get useful data about them
set ::Command(3) "ps -Ao fname,pid,user,etime,state | grep lamd"
# find all search users processes and get useful data about them
set ::Command(4) "ps -Ao fname,pid,user,etime,state | grep search"
# date
set ::Command(5) "date +'%A %B %d %Y %r %Z'"
;## ********************************************************
;## **************** END COMMAND DATABASE ******************
;## ********************************************************
## ********************************************************
##
## Name: main loop
##
## Description:
## Bare code. Not a procedure!
##
## Parameters:
##
## Usage:
##
## Comments:
##
foreach command $commands {
if { [ regexp {^[0-9]+$} $command ] } {
lappend ::_commands $command
} else {
command_manager $command $documentation
}
}
# usage if -h
if { [ regexp -nocase -- {\s+-+h} $::argv ] || \
[ regexp -nocase -- {^-+h} $::argv ] } {
usage
}
# the output will get a unique filename with a timestamp
openlog
set ::nodelist [ create_nodelist ]
puts -nonewline stderr "scanning: "
set i 0
foreach node $::nodelist {
if { $::asynchronous == 1 } {
concurrent $node
} elseif { $::asynchronous == 0 } {
nonconcurrent $node
} else {
puts stderr "\n\n-asychronous option can ONLY be 0 or 1.\n\n"
exit
}
if { ! [ string equal stdout $runlog ] } {
incr i
if { [ expr {$i % 10} ] } {
puts -nonewline stderr [ + bold blue ]*[ + normal ]
} else {
puts -nonewline stderr [ + bold red ]$i[ + normal ]
}
}
}
if { $::asynchronous == 0 } {
puts stderr " [ + bold red ]DONE![ + normal ]"
close $::logfid
exit
} else {
puts stderr " [ + bold red ]WAITING...[ + normal ]"
vwait ::DONE
puts stderr " [ + bold red ]DONE![ + normal ]"
close $::logfid
exit
}
## ********************************************************