This section describes a sample set of out-of-memory (OOM) adjustment scripts for use from cron and from the PBS prologue and epilogue.
Example A-1. oom_adj.user.pl.txt : OOM Adjustment Script
#!/usr/bin/perl
# oom_adj.user.pl : OOM adjustment script (run from cron and from the PBS
# prologue/epilogue via chk_node.pl).
#
# Walks the process table and tunes the kernel OOM-killer adjustment for
# every live PID:
#   - processes owned by ordinary users get oom_adj 0  (normal kill candidates)
#   - processes owned by accounts in $AVOID_UIDS get oom_adj -17 (exempt
#     from the OOM killer)
#
# Must run as root: only root may write /proc/<pid>/oom_adj.
use strict;
use Sys::Hostname;

my $host      = hostname();
my $DEBUG     = 0;          # 0=turn off, 1=turn on
my $CALL_SCPT = $ARGV[0];   # caller tag (e.g. "Plog"/"Elog") echoed in log lines

# Write one oom_adj value directly from perl.  The original shelled out to
# `echo N > /proc/<pid>/oom_adj` via backticks, forking a shell per PID.
# Best effort: the PID may exit between the ps listing and the write.
sub WriteOomAdj {
    my ($oom_file, $value) = @_;
    open(my $fh, '>', $oom_file) or return;
    print {$fh} "$value\n";
    close $fh;
    return;
}

sub ResetOomAdj {
    # Accounts whose processes must stay protected from the OOM killer.
    # NOTE(review): the match below is anchored only at the start of the
    # username, so e.g. a user named "rootless" would also match — behavior
    # kept from the original; confirm before tightening.
    my $AVOID_UIDS = "root|100|nobody|ntp|USER|daemon|postfix|vtunesag";

    open(my $ps_cmd, '-|', 'ps -e -o user,pid')
        or die "cannot run ps: $!";
    while (my $line = <$ps_cmd>) {
        chomp $line;
        my ($userid, $tpid) = split /\s+/, $line;

        # Skip the "USER PID" header row and PIDs that exited already.
        next unless defined $tpid && $tpid =~ /^[0-9]/;
        next unless -e "/proc/$tpid/oom_adj";

        if ($userid !~ m/^${AVOID_UIDS}/) {
            # Ordinary user process: make it a normal OOM-kill candidate.
            print "$CALL_SCPT $host: Found processes to set to zero oom_adj...\n" if $DEBUG;
            WriteOomAdj("/proc/$tpid/oom_adj", 0);
        }
        else {
            # System process: shield it from the OOM killer.
            print "$CALL_SCPT $host: Found processes to set to protect oom_adj...\n" if $DEBUG;
            WriteOomAdj("/proc/$tpid/oom_adj", -17);
        }
    }
    close $ps_cmd;
    return;
}

ResetOomAdj();
Example A-3. prologue: Sample prologue Script
#!/bin/bash
##################################################################################
#
# Version: 2.3.1 : Updated 8/12/09
# Date:    Oct 16, 2007
# Author:  Scott Shaw, sshaw@sgi.com
#
# Script Name: PBS Pro Prologue Script
# The purpose of the prologue script is to terminate leftover user processes and
# allocated IPC resources.  It consists of two scripts: this main prologue
# script and chk_node.pl.  To minimize per-node access the prologue runs a
# parallel ssh fan-out across the nodes listed in the PBS node file; for large
# clusters (over 64 nodes) serial ssh is too slow.  Some MPI implementations do
# not terminate their processes on abnormal exit (poor error-code handling or
# segmentation faults), leaving user processes consuming system resources.
#
# When PBS MOM launches this script, an ssh session runs chk_node.pl on each
# node from the job's node file; chk_node.pl performs the actual clean-up.
#
# This script runs as root and must reside on each execution host/node:
#   Location:    /var/spool/PBS/mom_priv
#   File name:   prologue
#   Permissions: 755, owner root, group root
#
# Modification of prologalarm may be necessary if network access to the nodes
# is slow; 30 seconds may not be enough time to check 256 nodes.
#   prologalarm  Maximum number of seconds the prologue/epilogue may run
#                before timing out.  Default: 30.  Integer.
#   Example:     $prologalarm 30
#
##################################################################################

JOBID=$1
USERNAME=$2
GROUPNAME=$3
JOBNAME=$4
P_PID=$5
NPCUS=$6          # NOTE(review): probably meant NCPUS; name kept for compatibility
CPU_PERCENT=$7
QUEUE=$8
TTY_TYPE=$9
UNKNOWN_ARG=${10} # BUGFIX: "$10" expands as "${1}0"; braces are required for arg 10

VERSION="v2.3.1"
SSHOPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=6"

# If the cluster blade layout is not sequential then use a flat file.
NODES_FILE="/var/spool/PBS/aux/${JOBID}"

# spawn MAXJOBS CMD...: run CMD in the background, first draining the current
# batch once MAXJOBS background jobs are outstanding.
spawn () {
    if [[ `jobs | grep -v Done | wc -l` -ge $1 ]]; then
        wait
    fi
    shift
    $@ &
}

# exec_cmd: fan $CMDLINE out over every unique node in $NODES_FILE,
# at most 25 concurrent ssh sessions.
exec_cmd () {
    for HOSTNAME in $( sort -u ${NODES_FILE} )
    do
        spawn 25 ssh ${SSHOPTS} ${HOSTNAME} $CMDLINE
    done
    wait
}

# main()

# Find the PBS qstat command.
if [ -f /usr/pbs/bin/qstat ]; then
    QSTAT=/usr/pbs/bin/qstat
elif [ -f /opt/pbs/default/bin/qstat ]; then
    QSTAT=/opt/pbs/default/bin/qstat
else
    # BUGFIX: this message previously said "Epilogue Error" in the prologue.
    echo "Prologue Error: The qstat command could not be detected, exiting..."
    exit 1
fi

# The job-name prefix (text before the first ".") selects the turbo/cleanup mode.
prefix_flag=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $4}' | awk -F. '{print $1}'`
queue=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $3}'`

echo "Start Prologue ${VERSION} `date` "

if [ $( /bin/uname -m ) = "x86_64" ]; then
    echo "Prefix passed: ${prefix_flag}"
    echo "destination queue: ${queue}"
    case $prefix_flag in
        TB) # Enable turbo and do node cleanup
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} TB"
            exec_cmd
            ;;
        BP) # Bypass the turbo setting and P/Elog cleanup
            echo "* * * * Bypassing the PBS Prologue and Epilogue scripts * * * *"
            ;;
        JT) # Enable turbo but do not run the node cleanup p/elog scripts
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} JT"
            exec_cmd
            ;;
        NT) # Bypass turbo settings but run the node cleanup
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} NT"
            exec_cmd
            ;;
        *)  # Disable turbo and run the node cleanup scripts
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue}"
            exec_cmd
            ;;
    esac
else
    echo "The prologue script is intended to run on x86_64 nodes not `uname -m`."
    echo "End Prologue ${VERSION} `date` "
    exit 1   # BUGFIX: was "exit -1"; exit status must be in 0-255
fi

echo "End Prologue ${VERSION} `date` "

# Output the cluster details file.
if [ -f /var/spool/PBS/mom_priv/cluster_info.out ]; then
    cat /var/spool/PBS/mom_priv/cluster_info.out
else
    echo "WARNING: The cluster info file does not exist. Contact hpc_support and report this warning."
fi
Example A-4. epilogue: Sample epilogue Script
#!/bin/bash
##################################################################################
#
# Version: 2.3.1 : Updated 8/12/09
# Date:    Oct 16, 2007
# Author:  Scott Shaw, sshaw@sgi.com
#
# Script Name: PBS Pro Epilogue Script
# The purpose of the epilogue script is to terminate leftover user processes and
# allocated IPC resources.  It consists of two scripts: this main epilogue
# script and chk_node.pl.  To minimize per-node access the epilogue runs a
# parallel ssh fan-out across the nodes listed in the PBS node file; for large
# clusters (over 64 nodes) serial ssh is too slow.  Some MPI implementations do
# not terminate their processes on abnormal exit (poor error-code handling or
# segmentation faults), leaving user processes consuming system resources.
#
# When PBS MOM launches this script, an ssh session runs chk_node.pl on each
# node from the job's node file; chk_node.pl performs the actual clean-up.
#
# This script runs as root and must reside on each execution host/node:
#   Location:    /var/spool/PBS/mom_priv
#   File name:   epilogue
#   Permissions: 755, owner root, group root
#
# Modification of prologalarm may be necessary if network access to the nodes
# is slow; 30 seconds may not be enough time to check 256 nodes.
#   prologalarm  Maximum number of seconds the prologue/epilogue may run
#                before timing out.  Default: 30.  Integer.
#   Example:     $prologalarm 30
#
##################################################################################

JOBID=$1
USERNAME=$2
GROUPNAME=$3
JOBNAME=$4
P_PID=$5
NPCUS=$6          # NOTE(review): probably meant NCPUS; name kept for compatibility
CPU_PERCENT=$7
QUEUE=$8
TTY_TYPE=$9
UNKNOWN_ARG=${10} # BUGFIX: "$10" expands as "${1}0"; braces are required for arg 10

VERSION="v2.3.1"
SSHOPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=6"

# If the cluster blade layout is not sequential then use a flat file.
NODES_FILE="/var/spool/PBS/aux/${JOBID}"

# spawn MAXJOBS CMD...: run CMD in the background, first draining the current
# batch once MAXJOBS background jobs are outstanding.
spawn () {
    if [[ `jobs | grep -v Done | wc -l` -ge $1 ]]; then
        wait
    fi
    shift
    $@ &
}

# exec_cmd: fan $CMDLINE out over every unique node in $NODES_FILE,
# at most 25 concurrent ssh sessions.
exec_cmd () {
    for HOSTNAME in $( sort -u ${NODES_FILE} )
    do
        spawn 25 ssh ${SSHOPTS} ${HOSTNAME} $CMDLINE
    done
    wait
}

# main()

# Find the PBS qstat command.
if [ -f /usr/pbs/bin/qstat ]; then
    QSTAT=/usr/pbs/bin/qstat
elif [ -f /opt/pbs/default/bin/qstat ]; then
    QSTAT=/opt/pbs/default/bin/qstat
else
    echo "Epilogue Error: The qstat command could not be detected, exiting..."
    exit 1
fi

# The job-name prefix (text before the first ".") selects the turbo/cleanup mode.
prefix_flag=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $4}' | awk -F. '{print $1}'`
queue=`${QSTAT} -a ${JOBID} | grep "^[0-9]" | awk '{print $3}'`

echo "Start Epilogue ${VERSION} `date` "

if [ $( /bin/uname -m ) = "x86_64" ]; then
    echo "Prefix passed: ${prefix_flag}"
    echo "destination queue: ${queue}"
    case $prefix_flag in
        TB) # Enable turbo and do node cleanup
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset"
            exec_cmd
            ;;
        BP) # Bypass the turbo setting and P/Elog cleanup
            echo "* * * * Bypassing the PBS Prologue and Epilogue scripts * * * *"
            ;;
        JT) # Enable turbo but do not run the node cleanup p/elog scripts
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset JT"
            exec_cmd
            ;;
        NT) # Bypass turbo settings but run the node cleanup
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog noreset NT"
            exec_cmd
            ;;
        *)  # Disable turbo and run the node cleanup scripts
            CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset"
            exec_cmd
            ;;
    esac
else
    echo "The epilogue script is intended to run on x86_64 nodes not `uname -m`."
    echo "End Epilogue ${VERSION} `date` "
    exit 1   # BUGFIX: was "exit -1"; exit status must be in 0-255
fi

echo "End Epilogue ${VERSION} `date` "
Example A-5. chk_node.pl.txt : Script Used by the epilogue and prologue Scripts.
#!/usr/bin/perl
# chk_node.pl : node sanitizer called by the PBS Pro prologue and epilogue
# scripts when a user submits a job through PBS Pro.  Sanitizes each node in
# the $PBS_NODEFILE list by terminating old user processes, removing stale IPC
# allocations and temp files, and flushing the system buffer cache.
#
# Version: 2.3.1 : Updated 8/12/09   Orig Date: Oct 10, 2007
# Author:  Scott Shaw, sshaw@sgi.com
#
# Changes:
#  2/1/08  sshaw@sgi.com - added /tmp clean-up; changed system() to exec (was
#          corrupting memory); made all vars local to their subroutine; enabled
#          strict checking.
#  3/24/08 sshaw@sgi.com - fixed debug conditional; cleaned up CleanUpProcesses
#          (reports which process/user is terminated); killall -> pkill because
#          of userids longer than 8 chars.
# 11/13/08 sshaw@sgi.com - added /dev/shm clean-up (users treat it as scratch).
#  3/31/09 sshaw@sgi.com - enable/disable Turbo mode on Intel 5500-series CPUs.
#  4/22/09 sshaw@sgi.com - speed-step the core frequency to a lower freq.
#  8/12/09 sshaw@sgi.com - minor frequency-setting fixes; fixed cpu freq to max.
use strict;
use Sys::Hostname;

my $host = hostname();
my $DEBUG = 1;                       # 0=turn off, 1=turn on
my $CALL_SCPT = $ARGV[0];            # caller tag: "Plog" or "Elog"
my $queue_destination = $ARGV[1];    # queue name, or "reset"/"noreset"/"f<freq>"
my $prefix_option = $ARGV[2];        # TB / JT / NT / (empty)
my $set_freq = 0;

#####
# Turbo/SMT control for Intel 5500-series CPUs via MSR 0x199.
my $rdmsr = "/var/spool/PBS/mom_priv/rdmsr";
my $wrmsr = "/var/spool/PBS/mom_priv/wrmsr";
my $msr = "0x199";
my $tbit = 1 << 32;   # turbo-disengage bit (requires a 64-bit perl)

# Several MPI implementations or MPI applications use IPC shared memory.  When
# an MPI application terminates abnormally it leaves the segments allocated;
# this subroutine removes any IPC resources left behind by the user's job.
sub CleanUpIPC_Table {
    open(my $ipcs_fh, '-|', 'ipcs -m') or return;
    # BUGFIX: the loop must read the pipe ("while (<...>)"); the previous
    # "while ()" never consumed the ipcs output.
    while (my $line = <$ipcs_fh>) {
        chomp $line;
        my ($tkey, $tshmid, $towner, $tperms, $tbytes, $tnattch, $tstatus) =
            split /\s+/, $line;
        # Data rows begin with a digit; skip the ipcs banner/header lines.
        next unless defined $tkey && $tkey =~ /^[0-9]/;
        # Never remove root-owned (or blank-owner) segments.
        if ($towner !~ m/root|^ /) {
            print "$CALL_SCPT $host: Found IPC_SHR_MEM allocation: $tshmid $towner, terminating...\n" if $DEBUG;
            my $ret = `ipcrm -m $tshmid`;
        }
    }
    close $ipcs_fh;
}

# Parse the process list and terminate any leftover user processes or logins
# on the node.  System accounts in $AVOID_UIDS are left alone.
sub CleanUpProcesses {
    my $AVOID_UIDS = "root|100|101|nobody|bin|ntp|UID|daemon|postfix|vtunesag";

    open(my $ps_fh, '-|', 'ps -ef') or return;
    # BUGFIX: read the pipe; the previous "while ()" never consumed ps output.
    while (my $line = <$ps_fh>) {
        chomp $line;
        my ($userid, $tpid, $tppid, $tcpu, $tstime, $ptty, $ttime, $tcmd) =
            split /\s+/, $line;
        next if !defined $userid || $userid =~ m/^${AVOID_UIDS}/;

        # ps reports a numeric UID when the username is too long; map it back
        # to a name via NIS.  NOTE(review): the egrep match is unanchored, so
        # a UID that appears elsewhere in a passwd line could mismatch.
        if ($userid =~ /^[0-9]/) {
            $userid = `ypcat passwd | egrep "${userid}" | cut -d ":" -f 1`;
            chomp $userid;
        }
        print "$CALL_SCPT $host: Found leftover processes $tcmd from $userid terminating...\n" if $DEBUG;
        # pkill instead of killall because of usernames longer than 8 chars.
        my $ret = `pkill -9 -u $userid`;
    }
    close $ps_fh;

    # Re-apply OOM-killer adjustments after the purge.
    system("/root/oom_adj.user.pl");
}

# This subroutine will remove any temporary files created by MPI applications under /tmp.
# Remove temporary files created by MPI applications under /tmp
# (mpd* sockets, ib_pool*/ib_shmem* scratch files).
sub CleanUpTmp {
    my @stale_files;

    open(my $ls_fh, '-|', 'ls /tmp') or return;
    # BUGFIX: read the pipe; the previous "while ()" never consumed the listing.
    while (my $entry = <$ls_fh>) {
        chomp $entry;
        my ($filename) = split /\s+/, $entry;
        next unless defined $filename;
        if ($filename =~ m/^(?:mpd|ib_pool|ib_shmem)/) {
            push @stale_files, $filename;
        }
    }
    close $ls_fh;

    foreach my $file (@stale_files) {
        my $ret = `rm -f /tmp/${file}`;
    }
    my $nofiles = scalar @stale_files;
    if ($nofiles ne 0) {
        print "$CALL_SCPT $host: Found $nofiles MPI temp files under /tmp. Removing...\n" if $DEBUG;
    }
}

# Flush the Linux IO buffer cache and the slab cache using the bcfree command
# (SGI-specific; silently skipped when the binary is absent).
sub FreeBufferCache {
    my $BCFREE = "/usr/bin/bcfree";
    my $BCFREE_OPTS = "-a -s";
    if (-e $BCFREE) {
        my $ret = `${BCFREE} ${BCFREE_OPTS}`;
    }
}

# Remove temporary files created by MPI applications under /dev/shm; several
# users treat this location as scratch space.  "sysconfig" entries are kept.
sub CleanUpshm {
    my @stale_entries;

    open(my $ls_fh, '-|', 'ls /dev/shm') or return;
    # BUGFIX: read the pipe; the previous "while ()" never consumed the listing.
    while (my $entry = <$ls_fh>) {
        chomp $entry;
        my ($filename) = split /\s+/, $entry;
        push @stale_entries, $filename if defined $filename;
    }
    close $ls_fh;

    foreach my $file (@stale_entries) {
        if ($file !~ m/sysconfig/) {
            my $ret = `rm -rf /dev/shm/${file}`;
            print "$ret" if $DEBUG;
            print "$CALL_SCPT $host: Found ${file} dir/file under /dev/shm. Removing it...\n" if $DEBUG;
        }
    }
}

# Ensure the msr kernel module is loaded so rdmsr/wrmsr can work.
# Hyperthreading assumption: if the first core has the bit set, all other
# cores within the node are assumed to have the same setting.
sub chk_msr_state {
    my $msr_lsmod = `lsmod | grep -c msr`;   # 0=not loaded, >=1=msr loaded
    if ($msr_lsmod == 0) {
        print "Loading MSR Kernel Modules...\n";
        `modprobe msr`;   # the msr module is needed to read/write MSR values
        sleep(1);         # give the module time to load
    }
}

# Toggle turbo mode ON for every core by flipping the turbo-disengage bit
# in MSR 0x199.
sub enable_turbo_mode {
    my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;

    chk_msr_state();
    print "${host}: Enabling turbo mode...\n";
    my $val;
    chomp($val = `$rdmsr -p 0 $msr`);
    # NOTE(review): the read value is immediately overwritten with this
    # hard-coded constant in the original — kept as-is; confirm intent.
    $val = hex("100000017");
    my $nval = $val ^ $tbit;
    printf("${host}: Changing msr $msr on all cores from 0x%lx to 0x%lx\n", $val, $nval);
    for (my $i = 0; $i < $ncpus; $i++) {
        `$wrmsr -p $i $msr $nval`;
    }
    load_system_services();
}

# Toggle turbo mode OFF for every core.
sub disable_turbo_mode {
    my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;

    chk_msr_state();
    print "${host}: Disabling turbo mode...\n";
    my $val;
    chomp($val = `$rdmsr -p 0 $msr`);
    # NOTE(review): read value overwritten with a hard-coded constant in the
    # original — kept as-is; confirm intent.
    $val = hex(16);
    my $nval = $val ^ $tbit;
    printf("${host}: Changing msr $msr on all cores from 0x%lx to 0x%lx\n", $val, $nval);
    for (my $i = 0; $i < $ncpus; $i++) {
        `$wrmsr -p $i $msr $nval`;
    }
}

# Start acpid/powersaved (needed for cpufreq control) if not already running.
sub load_system_services {
    my $powersave_loaded = `ps -ef | grep -v grep | grep -c power`;
    if ($powersave_loaded == 0) {
        print "${host}: Loading system services...\n";
        system("(/etc/init.d/acpid start;/etc/init.d/powersaved start)&> /dev/null");
        sleep(1);
        system("/usr/bin/powersave -f");
    }
    else {
        print "Powersaved already loaded.\n";
    }
}

# Stop acpid/powersaved.
sub unload_system_services {
    print "${host}: Unloading system services...\n";
    system("(/etc/init.d/acpid stop;/etc/init.d/powersaved stop)&> /dev/null");
}

# Run the full node clean-up sequence.
sub run_cleanup {
    CleanUpshm();
    CleanUpTmp();
    CleanUpIPC_Table();
    # NOTE(review): CleanUpProcesses is invoked twice in the original —
    # presumably a second pass for processes reparented after the first kill;
    # confirm before removing the duplicate call.
    CleanUpProcesses();
    CleanUpProcesses();
}

# Pin every core's cpufreq min and max scaling frequency to $freq (in MHz).
sub set_processor_speed {
    my $freq = shift;
    my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;

    load_system_services();
    $freq = $freq * 1000;   # MHz -> kHz, the unit sysfs cpufreq expects
    printf("${host}: Setting Proc Core speed to: %.3f GHz\n", ($freq / 1000000));
    for (my $i = 0; $i < $ncpus; $i++) {
        my $min_file = "/sys/devices/system/cpu/cpu" . $i . "/cpufreq/scaling_min_freq";
        open(my $min_fh, '>', $min_file) or die $!;
        print {$min_fh} "$freq\n";
        close $min_fh;

        my $max_file = "/sys/devices/system/cpu/cpu" . $i . "/cpufreq/scaling_max_freq";
        open(my $max_fh, '>', $max_file) or die $!;
        print {$max_fh} "$freq\n";
        close $max_fh;
    }
}

# main()
#
# The turbo/frequency dispatch below is retained (commented out) from the
# original for reference; only run_cleanup() is currently active.
#
#if( $prefix_option =~ m/TB/ ){
#    enable_turbo_mode();
#    run_cleanup();
#}
#elsif ( $prefix_option =~ m/JT/ ) {
#    print "* * * * ENABLE TURBO and bypass PBS Prologue and Epilogue scripts * * * *\n";
#    enable_turbo_mode();
#}
#elsif ( $prefix_option =~ m/NT/ ) {
#    print "* * * * Bypassing the Turbo checks and run just node clean-up * * * *\n";
#    run_cleanup();
#}
#elsif ( $queue_destination =~ /^f/ ) {
#    my $b=0;
#    ($a,$set_freq) = split (/f/, $queue_destination);
#    set_processor_speed($set_freq);
#}
#elsif ( $queue_destination =~ /^reset/ ) {
#    set_processor_speed(2934);
#    disable_turbo_mode();
#    unload_system_services();
#    run_cleanup();
#}
#else {
#    disable_turbo_mode();
#    unload_system_services();
#    run_cleanup();
#}

run_cleanup();