This section describes a sample set of out-of-memory (OOM) adjustment scripts for cron and the PBS prologue and epilogue.
Example A-1. oom_adj.user.pl.txt : OOM Adjustment Script
#!/usr/bin/perl
# oom_adj.user.pl: reset the kernel OOM-killer adjustment for every process.
# Processes owned by system accounts get oom_adj -17 (protected from the OOM
# killer); all other (user) processes get oom_adj 0 (normal kill candidates).
use strict;
use Sys::Hostname;
my $host = hostname();
my $DEBUG=0; # 0=turn off, 1=turn on
my $CALL_SCPT=$ARGV[0]; # tag printed in debug output (e.g. "Plog" or "Elog")

# Write $adj into /proc/$pid/oom_adj directly instead of shelling out to
# "echo $adj > file" through backticks, which forked a shell per process.
sub _write_oom_adj {
    my ($pid, $adj) = @_;
    open my $oom, '>', "/proc/$pid/oom_adj" or return;
    print {$oom} "$adj\n";
    close $oom;
}

sub ResetOomAdj {
    # System accounts whose processes are protected.  Grouped with (?:...) so
    # that "^" anchors every alternative -- the original /^root|100|.../
    # anchored only "root" and matched the other names anywhere in the string.
    my $AVOID_UIDS = qr/^(?:root|100|nobody|ntp|USER|daemon|postfix|vtunesag)/;

    # List-form exec avoids an intermediate shell for the ps pipe.
    open (PS_CMD, "-|") || exec 'ps', '-e', '-o', 'user,pid';
    while (<PS_CMD>) {
        chomp;
        my ($userid, $tpid) = split /\s+/, $_;
        # Skip the header row, malformed lines, and PIDs that already exited.
        next unless defined $tpid && $tpid =~ /^[0-9]/ && -e "/proc/$tpid/oom_adj";
        if ( $userid !~ $AVOID_UIDS ) {
            print "$CALL_SCPT $host: Found processes to set to zero oom_adj...\n" if $DEBUG;
            _write_oom_adj($tpid, 0);
        }
        else {
            print "$CALL_SCPT $host: Found processes to set to protect oom_adj...\n" if $DEBUG;
            _write_oom_adj($tpid, -17);
        }
    }
    close PS_CMD;
}

ResetOomAdj();
Example A-3. prologue: Sample prologue Script
#!/bin/bash
##################################################################################
#
# Version: 2.3.1 : Updated 8/12/09
# Date: Oct 16, 2007
# Author: Scott Shaw, sshaw@sgi.com
#
# Script Name: PBS Pro Prologue Script
# The purpose of the Prologue script is to terminate leftover user processes and
# allocated IPCs resources. The prologue script consists of two scripts, the main
# prologue script and a chk_node.pl script. To minimize accessing each node the
# prologue script executes a parallel ssh shell across a set of nodes based on the
# PBS_NODEFILE. For large clusters over 64 nodes serial ssh access is slow so having
# a flexible parallel ssh to help speed up the clean-up process of each node. In
# some cases, a PBS job can terminate normally but some MPI implementations do not
# normally terminate the MPI processes due to crappy error code handling or
# segmentation faults within the MPI application thus leaving behind user processes
# still consuming system resources.
#
# When the prologue script is launched by PBS MOM the ssh session is executed and will
# execute the chk_node.pl script. The chk_node.pl script contains a series of clean-up
# commands which are executed on each node based on the PBS_NODEFILE.
#
# Execution of the prologue script is based on the root account.
#
# This script needs to reside on each execution host/node
# Location: /var/spool/PBS/mom_priv
# File name: prologue
# Permissions: 755
# Owner: root
# Group: root
#
# ls output: ls -l /var/spool/PBS/mom_priv/prologue
# -rwxr-xr-x 1 root root 2054 Sep 6 19:39 /var/spool/PBS/mom_priv/prologue
#
# Modification of the prologalarm may be necessary if network access to each
# node is slow. 30 seconds may not be enough time to check 256 nodes in a cluster.
# prologalarm # Defines the maximum number of seconds the prologue
# and epilogue may run before timing out. Default:
# 30. Integer. Example:
# $prologalarm 30
#
##################################################################################
# Positional arguments passed by PBS MOM to the prologue script.
JOBID=$1
USERNAME=$2
GROUPNAME=$3
JOBNAME=$4
P_PID=$5
NPCUS=$6
CPU_PERCENT=$7
QUEUE=$8
TTY_TYPE=$9
UNKNOWN_ARG=${10}   # $10 would expand as ${1}0; braces are required beyond $9
VERSION="v2.3.1"
SSHOPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=6"
# If the cluster blade layout is not sequential, use a flat file instead.
NODES_FILE="/var/spool/PBS/aux/${JOBID}"
# spawn MAXJOBS CMD [ARGS...]
# Launch CMD in the background, first waiting for the current batch of
# background jobs to drain whenever MAXJOBS or more are still running.
# Caps the ssh fan-out so large clusters do not overwhelm the MOM node.
spawn ()
{
    if [[ $(jobs | grep -v Done | wc -l) -ge $1 ]]; then
        wait
    fi
    shift
    "$@" &   # quoted so each argument is passed through unchanged
}
# exec_cmd: fan the command in $CMDLINE out over ssh to every unique node
# listed in $NODES_FILE, at most 25 concurrent sessions (via spawn), and
# block until all of them have completed.
exec_cmd ()
{
    local node
    for node in $(sort -u "${NODES_FILE}")
    do
        spawn 25 ssh ${SSHOPTS} ${node} $CMDLINE
    done
    wait
}
# main()
# Locate the PBS qstat command.
if [ -f /usr/pbs/bin/qstat ]; then
    QSTAT=/usr/pbs/bin/qstat
elif [ -f /opt/pbs/default/bin/qstat ]; then
    QSTAT=/opt/pbs/default/bin/qstat
else
    # Fixed copy-paste bug: this is the prologue, not the epilogue.
    echo "Prologue Error: The qstat command could not be detected, exiting..."
    exit 1
fi
# The job-name prefix (characters before the first '.') selects the
# turbo/cleanup mode; the queue name is forwarded to chk_node.pl.
prefix_flag=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $4}' | awk -F. '{print $1}'`
queue=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $3}'`
echo "Start Prologue ${VERSION} `date` "
if [ $( /bin/uname -m ) = "x86_64" ]; then
    echo "Prefix passed: ${prefix_flag}"
    echo "destination queue: ${queue}"
    case $prefix_flag in
    TB)
        # Enable turbo and do node cleanup
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} TB"
        exec_cmd
        ;;
    BP)
        # Bypass the turbo setting and P/Elog cleanup
        echo "* * * * Bypassing the PBS Prologue and Epilogue scripts * * * *"
        ;;
    JT)
        # Enable turbo but do not run the node cleanup p/elog scripts
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} JT"
        exec_cmd
        ;;
    NT)
        # Bypass turbo settings but run the node cleanup
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} NT"
        exec_cmd
        ;;
    *)
        # Disable turbo and run the node cleanup scripts
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue}"
        exec_cmd
        ;;
    esac
else
    echo "The prologue script is intended to run on x86_64 nodes not `uname -m`."
    echo "End Prologue ${VERSION} `date` "
    exit -1
fi
echo "End Prologue ${VERSION} `date` "
# Output the cluster details file
if [ -f /var/spool/PBS/mom_priv/cluster_info.out ]; then
    cat /var/spool/PBS/mom_priv/cluster_info.out
else
    echo "WARNING: The cluster info file does not exist. Contact hpc_support and report this warning."
fi
Example A-4. epilogue: Sample epilogue Script
#!/bin/bash
##################################################################################
#
# Version: 2.3.1 : Updated 8/12/09
# Date: Oct 16, 2007
# Author: Scott Shaw, sshaw@sgi.com
#
# Script Name: PBS Pro Epilogue Script
# The purpose of the epilogue script is to terminate leftover user processes and
# allocated IPCs resources. The epilogue script consists of two scripts, the main
# epilogue script and a chk_node.pl script. To minimize accessing each node the
# epilogue script executes a parallel ssh shell across a set of nodes based on the
# PBS_NODEFILE. For large clusters over 64 nodes serial ssh access is slow so having
# a flexible parallel ssh to help speed up the clean-up process of each node. In
# some cases, a PBS job can terminate normally but some MPI implementations do not
# normally terminate the MPI processes due to crappy error code handling or
# segmentation faults within the MPI application thus leaving behind user processes
# still consuming system resources.
#
# When the epilogue script is launched by PBS MOM the ssh session is executed and will
# execute the chk_node.pl script. The chk_node.pl script contains a series of clean-up
# commands which are executed on each node based on the PBS_NODEFILE.
#
# Execution of the epilogue script is based on the root account.
#
# This script needs to reside on each execution host/node
# Location: /var/spool/PBS/mom_priv
# File name: epilogue
# Permissions: 755
# Owner: root
# Group: root
#
# ls output: ls -l /var/spool/PBS/mom_priv/epilogue
# -rwxr-xr-x 1 root root 2054 Sep 6 19:39 /var/spool/PBS/mom_priv/epilogue
#
# Modification of the prologalarm may be necessary if network access to each
# each node. 30 seconds may not be enough time to check 256 nodes in a cluster.
# prologalarm # Defines the maximum number of seconds the prologue
# and epilogue may run before timing out. Default:
# 30. Integer. Example:
# $prologalarm 30
#
##################################################################################
# Positional arguments passed by PBS MOM to the epilogue script.
JOBID=$1
USERNAME=$2
GROUPNAME=$3
JOBNAME=$4
P_PID=$5
NPCUS=$6
CPU_PERCENT=$7
QUEUE=$8
TTY_TYPE=$9
UNKNOWN_ARG=${10}   # $10 would expand as ${1}0; braces are required beyond $9
VERSION="v2.3.1"
SSHOPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=6"
# If the cluster blade layout is not sequential, use a flat file instead.
NODES_FILE="/var/spool/PBS/aux/${JOBID}"
# spawn MAXJOBS CMD [ARGS...]
# Launch CMD in the background, first waiting for the current batch of
# background jobs to drain whenever MAXJOBS or more are still running.
# Caps the ssh fan-out so large clusters do not overwhelm the MOM node.
spawn ()
{
    if [[ $(jobs | grep -v Done | wc -l) -ge $1 ]]; then
        wait
    fi
    shift
    "$@" &   # quoted so each argument is passed through unchanged
}
# exec_cmd: fan the command in $CMDLINE out over ssh to every unique node
# listed in $NODES_FILE, at most 25 concurrent sessions (via spawn), and
# block until all of them have completed.
exec_cmd ()
{
    local node
    for node in $(sort -u "${NODES_FILE}")
    do
        spawn 25 ssh ${SSHOPTS} ${node} $CMDLINE
    done
    wait
}
# main()
# Locate the PBS qstat command.
if [ -f /usr/pbs/bin/qstat ]; then
    QSTAT=/usr/pbs/bin/qstat
elif [ -f /opt/pbs/default/bin/qstat ]; then
    QSTAT=/opt/pbs/default/bin/qstat
else
    echo "Epilogue Error: The qstat command could not be detected, exiting..."
    exit 1
fi
# The job-name prefix (characters before the first '.') selects the
# turbo/cleanup mode handed to chk_node.pl.
prefix_flag=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $4}' | awk -F. '{print $1}'`
queue=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $3}'`
echo "Start Epilogue ${VERSION} `date` "
if [ $( /bin/uname -m ) = "x86_64" ]; then
    echo "Prefix passed: ${prefix_flag}"
    echo "destination queue: ${queue}"
    case $prefix_flag in
    TB)
        # Turbo was enabled in the prologue: reset frequency and clean the node
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset"
        exec_cmd
        ;;
    BP)
        # Bypass the turbo setting and P/Elog cleanup
        echo "* * * * Bypassing the PBS Prologue and Epilogue scripts * * * *"
        ;;
    JT)
        # Turbo-only job: reset frequency but skip the cleanup routines
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset JT"
        exec_cmd
        ;;
    NT)
        # Bypass turbo settings but run the node cleanup
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog noreset NT"
        exec_cmd
        ;;
    *)
        # Disable turbo and run the node cleanup scripts
        CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset"
        exec_cmd
        ;;
    esac
else
    echo "The epilogue script is intended to run on x86_64 nodes not `uname -m`."
    echo "End Epilogue ${VERSION} `date` "
    exit -1
fi
echo "End Epilogue ${VERSION} `date` "
Example A-5. chk_node.pl.txt: Script Used by the prologue and epilogue Scripts.
#!/usr/bin/perl
# Version: 2.3.1 : Updated 8/12/09
# Orig Date: Oct 10, 2007
# Author: Scott Shaw, sshaw@sgi.com
#
# This perl script is called by PBS Pro prologue and epilogue scripts when
# a user submits a job through PBS Pro. The purpose of this script is to
# sanitize a range of nodes identified by the $PBS_NODEFILE list by
# terminating old user processes, old ipc allocations, temp files,
# and to flush the system buffer cache.
#
# Changes:
# 2/1/08 sshaw@sgi.com
# - Added a subroutine to clean-up /tmp directory
# - changed system() to exec since it was corrupting memory
# - declared all vars to be local to subroutine, before it was loosely defined
# - added strict checking of perl script
# 3/24/08 sshaw@sgi.com
# - fixed debug conditional
# - cleaned up the CleanUpProcesses procedure and added which processes
# and user being terminated.
# - Changed the killall to pkill due to userid > 8 chars
# 11/13/08 sshaw@sgi.com
# - added a subroutine to clean-up /dev/shm since several users
# use this location for temporary scratch space.
# 03/31/09 sshaw@sgi.com
# - added subroutines to enable/disable Turbo mode on Intel series 5500 CPUs
# 04/22/09 sshaw@sgi.com
# - added subroutines to speed step the core processor frequency to a lower freq
# 08/12/09 sshaw@sgi.com
# - fixed minor issues with setting the frequency and fixed cpu freq to max speed
use strict;
use Sys::Hostname;
my $host = hostname();     # node name, prefixed to every log message
my $DEBUG=1; # 0=turn off, 1=turn on
my $CALL_SCPT=$ARGV[0];            # caller tag from the P/Elog scripts: "Plog" or "Elog"
my $queue_destination=$ARGV[1];    # queue name (prologue) or reset/noreset (epilogue)
my $prefix_option=$ARGV[2];        # optional turbo flag: TB, JT, or NT
my $set_freq=0;                    # target core frequency, parsed from the queue name
#####
# The following lines are added for Turbo/SMT mode starting with Intel 5500 series CPUs
my $rdmsr = "/var/spool/PBS/mom_priv/rdmsr";   # msr-tools: read an MSR
my $wrmsr = "/var/spool/PBS/mom_priv/wrmsr";   # msr-tools: write an MSR
my $msr = "0x199";    # presumably IA32_PERF_CTL -- confirm against the Intel SDM
my $tbit = 1 << 32;   # bit 32 toggled by the turbo subs; requires a 64-bit Perl
# Several MPI implementations or MPI applications use IPC shared memory. When
# a MPI application abnormally terminates it leaves behind allocated resources.
# this subroutine will remove any IPC resources allocated for the user's job.
# Several MPI implementations or MPI applications use IPC shared memory. When
# an MPI application abnormally terminates it leaves behind allocated resources.
# This subroutine removes every shared-memory segment not owned by root.
sub CleanUpIPC_Table {
    my ($tkey, $tshmid, $towner, $tperms, $tbytes, $tnattch, $tstatus);
    my $CMD_LINE;
    my $RETURN;
    # List-form exec avoids an intermediate shell for the ipcs pipe.
    open(IPC_SHARMEM, "-|") || exec 'ipcs', '-m';
    # NOTE(review): the loop condition was an empty "while ()" -- the
    # <IPC_SHARMEM> readline was lost in formatting; restored here.
    while (<IPC_SHARMEM>) {
        chomp;
        ($tkey, $tshmid, $towner, $tperms, $tbytes, $tnattch, $tstatus) = split (/\s+/, $_);
        # Data rows start with a numeric key (e.g. 0x00000000); skip headers.
        if ( defined $tkey && $tkey =~ /^[0-9]/ ) {
            if ( $towner !~ m/root|^ / ) {
                print "$CALL_SCPT $host: Found IPC_SHR_MEM allocation: $tshmid $towner, terminating...\n" if $DEBUG;
                $CMD_LINE="ipcrm -m $tshmid";
                $RETURN=`$CMD_LINE`;
            }
        }
    }
    close IPC_SHARMEM;
}
# This subroutine will parse the process list and terminate any user processes or logins
# into the node(s)
# This subroutine parses the process list and terminates any leftover user
# processes or logins on the node, then re-runs the OOM adjustment script.
sub CleanUpProcesses {
    my ($_userid, $tpid, $tppid, $tcpu, $tstime, $ptty, $ttime, $tcmd);
    my $CMD_LINE;
    my $RETURN;
    # System accounts whose processes are never killed.  Grouped with (?:...)
    # so "^" anchors every alternative -- the original /^root|100|.../
    # anchored only "root" and matched the rest anywhere in the name.
    my $AVOID_UIDS = qr/^(?:root|100|101|nobody|bin|ntp|UID|daemon|postfix|vtunesag)/;
    open (PS_CMD, "-|") || exec 'ps', '-ef';
    # NOTE(review): restored the <PS_CMD> readline lost from "while ()".
    while (<PS_CMD>) {
        chomp;
        ($_userid, $tpid, $tppid, $tcpu, $tstime, $ptty, $ttime, $tcmd) = split (/\s+/, $_);
        next unless defined $_userid && $_userid ne '';
        if ( $_userid !~ $AVOID_UIDS ) {
            # A numeric UID means the account is not in the local passwd file;
            # resolve the login name through NIS.  Match the UID field exactly
            # (field 3) instead of grepping for the number anywhere in the line.
            if ( $_userid =~ /^[0-9]/ ) {
                $_userid = `ypcat passwd | awk -F: -v uid="$_userid" '\$3 == uid {print \$1; exit}'`;
                chomp $_userid;
            }
            # Guard: never run "pkill -9 -u" with an empty username.
            if ( $_userid ne '' ) {
                print "$CALL_SCPT $host: Found leftover processes $tcmd from $_userid terminating...\n" if $DEBUG;
                $CMD_LINE="pkill -9 -u $_userid"; # pkill, not killall: usernames can exceed 8 chars
                $RETURN=`$CMD_LINE`;
            }
        }
    }
    close PS_CMD;
    # Re-apply the OOM-killer adjustments after the kill pass.
    system("/root/oom_adj.user.pl");
}
# This subroutine will remove any temporary files created by MPI application under /tmp.
# This subroutine removes temporary files created by MPI applications under
# /tmp: MPD ring files (mpd*) and InfiniBand pool/shmem files (ib_pool*,
# ib_shmem*).
sub CleanUpTmp {
    my @TERM_FILE;
    my $CMD_LINE;
    my $RETURN;
    open (LS_CMD, "-|") || exec 'ls', '/tmp';
    # NOTE(review): restored the <LS_CMD> readline lost from "while ()".
    while (<LS_CMD>) {
        chomp;
        my ($_filename) = split (/\s+/, $_);
        next unless defined $_filename;
        # One alternation replaces the original three identical elsif branches.
        if ( $_filename =~ m/^(?:mpd|ib_pool|ib_shmem)/ ) {
            push @TERM_FILE, $_filename;
        }
    }
    close LS_CMD;
    foreach my $FILE (@TERM_FILE) {
        $CMD_LINE="rm -f /tmp/${FILE}";
        $RETURN=`$CMD_LINE`;
    }
    my $_nofiles = scalar @TERM_FILE;
    # Numeric comparison; the original used the string operator "ne".
    if ($_nofiles != 0) {
        print "$CALL_SCPT $host: Found $_nofiles MPI temp files under /tmp. Removing...\n" if $DEBUG;
    }
}
# Flush the Linux IO buffer cache and the slab cache using the bcfree command.
# Flush the Linux I/O buffer cache and the slab cache using the SGI bcfree
# command, when that utility is installed on the node; otherwise do nothing.
sub FreeBufferCache {
    my $bcfree      = "/usr/bin/bcfree";
    my $bcfree_opts = "-a -s";
    return unless -e "${bcfree}";
    my $output = `${bcfree} ${bcfree_opts}`;
}
# This subroutine will remove any temporary files created by MPI application under /dev/shm.
# This subroutine removes temporary files created by MPI applications under
# /dev/shm, which several users employ as scratch space.  Every entry except
# those whose name contains "sysconfig" is deleted.
sub CleanUpshm {
    my @TERM_FILE;
    my $CMD_LINE;
    my $RETURN;
    open (LS_CMD, "-|") || exec 'ls', '/dev/shm';
    # NOTE(review): restored the <LS_CMD> readline lost from "while ()".
    while (<LS_CMD>) {
        chomp;
        my ($_filename) = split (/\s+/, $_);
        push @TERM_FILE, $_filename if defined $_filename && $_filename ne '';
    }
    close LS_CMD;
    foreach my $FILE (@TERM_FILE) {
        if (${FILE} !~ m/sysconfig/) {
            $CMD_LINE="rm -rf /dev/shm/${FILE}";
            $RETURN=`$CMD_LINE`;
            print "${RETURN}" if $DEBUG;
            print "$CALL_SCPT $host: Found ${FILE} dir/file under /dev/shm. Removing it...\n" if $DEBUG;
        }
    }
}
# Ensure the msr kernel module is loaded so the rdmsr/wrmsr helpers can
# access the per-core MSR device files.
sub chk_msr_state {
    # Hyperthreading assumption: if the first core has the bit set to
    # enable/disable, all other cores in the node share the same setting.
    my $loaded = `lsmod | grep -c msr`;   # 0=not loaded, >0=msr loaded
    return if $loaded != 0;
    print "Loading MSR Kernel Modules...\n";
    `modprobe msr`;   # rdmsr/wrmsr need the msr module to read MSR values
    sleep(1);         # give the module time to finish loading
}
# Enable turbo mode by flipping bit 32 ($tbit) of MSR $msr (0x199) on every
# logical CPU, then restart the power-management services.
sub enable_turbo_mode {
# Count of logical CPUs; value still carries a trailing newline, which the
# numeric "<" comparison below tolerates.
my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;
my $i;
my $val;
my $nval;
chk_msr_state();
print "${host}: Enabling turbo mode...\n";
chomp($val = `$rdmsr -p 0 $msr`);
# NOTE(review): the value just read from core 0 is immediately discarded and
# replaced with a hard-coded 0x100000017 before the XOR below.  This looks
# intentional (pin a known baseline register value) but should be confirmed
# against the CPU's documented MSR 0x199 layout.
$val = hex("100000017");
$nval = $val ^ $tbit;
printf("${host}: Changing msr $msr on all cores from 0x%lx to 0x%lx\n", $val, $nval);
# Write the new value to the same MSR on every logical CPU.
for ($i = 0; $i < $ncpus; $i++) {
`$wrmsr -p $i $msr $nval`;
}
load_system_services();
}
# Disable turbo mode by flipping bit 32 ($tbit) of MSR $msr (0x199) on every
# logical CPU.  Unlike enable_turbo_mode, services are not restarted here.
sub disable_turbo_mode {
my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;
my $i;
my $val;
my $nval;
chk_msr_state();
print "${host}: Disabling turbo mode...\n";
chomp($val = `$rdmsr -p 0 $msr`);
# NOTE(review): the value read from core 0 is discarded and replaced with
# hex(16) (decimal 22, i.e. 0x16) before the XOR.  Presumably a hard-coded
# baseline for MSR 0x199 -- verify this constant is what was intended.
$val = hex(16);
#$val = hex($val);
$nval = $val ^ $tbit;
printf("${host}: Changing msr $msr on all cores from 0x%lx to 0x%lx\n", $val, $nval);
# Write the new value to the same MSR on every logical CPU.
for ($i = 0; $i < $ncpus; $i++) {
`$wrmsr -p $i $msr $nval`;
}
}
# Start acpid and powersaved if no powersave-related process is running,
# then force the frequency governor with "powersave -f".
sub load_system_services {
    my $running = `ps -ef | grep -v grep | grep -c power`;
    if ($running != 0) {
        print "Powersaved already loaded.\n";
        return;
    }
    print "${host}: Loading system services...\n";
    system("(/etc/init.d/acpid start;/etc/init.d/powersaved start)&> /dev/null");
    sleep(1);
    system("/usr/bin/powersave -f");
}
# Stop the acpid and powersaved services; all command output is discarded.
sub unload_system_services {
    print "${host}: Unloading system services...\n";
    my $stop_cmd = "(/etc/init.d/acpid stop;/etc/init.d/powersaved stop)&> /dev/null";
    system($stop_cmd);
}
# Run the full node-sanitizing pass: /dev/shm scratch, /tmp MPI files,
# leftover IPC shared-memory segments, then leftover user processes.
sub run_cleanup {
&CleanUpshm();
&CleanUpTmp();
&CleanUpIPC_Table();
&CleanUpProcesses();
# NOTE(review): CleanUpProcesses is called twice -- presumably a second pass
# to catch processes that respawned during the first kill sweep; confirm it
# is not an accidental duplicate.
&CleanUpProcesses();
}
# Pin every core's cpufreq scaling range (both min and max) to the requested
# frequency.  The argument arrives in MHz and is written to sysfs in kHz.
sub set_processor_speed {
    my $freq = shift;
    my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;
    load_system_services();
    $freq = $freq * 1000;   # MHz -> kHz, the unit the sysfs knobs expect
    printf("${host}: Setting Proc Core speed to: %.3f GHz\n",($freq/1000000)) ;
    for (my $cpu = 0; $cpu < $ncpus; $cpu++) {
        # Write min first, then max, matching the original ordering.
        foreach my $knob ("scaling_min_freq", "scaling_max_freq") {
            my $path = "/sys/devices/system/cpu/cpu" . $cpu . "/cpufreq/" . $knob;
            open my $fh, ">", $path or die $!;
            print {$fh} "$freq\n";
            close $fh;
        }
    }
}
#
#print "$prefix_option\n";
#print "$queue_destination\n";
#
# if ( $queue_destination =~ /^f/ ) {
# my $b=0;
# ($a,$set_freq) = split (/f/, $queue_destination);
# set_processor_speed($set_freq);
# }
# Don't run on systems with earlier than Nehalem processors
# Based on the prefix_option set turbo mode accordingly and run node cleanup routines.
#if( $prefix_option =~ m/TB/ ){
#enable_turbo_mode();
#run_cleanup();
#}
#elsif ( $prefix_option =~ m/JT/ ) {
#print "* * * * ENABLE TURBO and bypass PBS Prologue and Epilogue scripts * * * *\n";
#enable_turbo_mode();
#}
#elsif ( $prefix_option =~ m/NT/ ) {
#print "* * * * Bypassing the Turbo checks and run just node clean-up * * * *\n";
#run_cleanup();
#}
#elsif ( $queue_destination =~ /^f/ ) {
#my $b=0;
#($a,$set_freq) = split (/f/, $queue_destination);
#set_processor_speed($set_freq);
#}
#elsif ( $queue_destination =~ /^reset/ ) {
#set_processor_speed(2934);
#disable_turbo_mode();
#unload_system_services();
#run_cleanup();
#}
#else {
#disable_turbo_mode();
#unload_system_services();
#run_cleanup();
#}
# Entry point: always run the cleanup pass (the turbo-mode dispatch above is
# currently commented out).  Stray trailing "|" removed -- it was a syntax error.
run_cleanup();