Appendix A. Out of Memory Adjustment

This section describes a sample set of out-of-memory (OOM) adjustment scripts for use with cron and with the PBS prologue and epilogue.
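
For reference, these scripts drive the kernel's per-process OOM control file, /proc/<pid>/oom_adj, where -17 exempts a process from the OOM killer and 0 is the default. For example (illustrative only; 1234 is a hypothetical PID):

   # echo -17 > /proc/1234/oom_adj     (protect process 1234 from the OOM killer)
   # echo 0 > /proc/1234/oom_adj       (restore the default)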

Example A-1. oom_adj.user.pl.txt : OOM Adjustment Script

#!/usr/bin/perl
use strict;

use Sys::Hostname;
my $host = hostname();
my $DEBUG=0; # 0=turn off, 1=turn on
my $CALL_SCPT=$ARGV[0];

sub ResetOomAdj {
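# Walk the process table: ordinary user processes get oom_adj reset to 0,
# while processes owned by the accounts in $AVOID_UIDS are set to -17 so
# the OOM killer never selects them.  ("USER" matches the ps header line.)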
my $AVOID_UIDS;
my $_userid;
my $tpid;
my $CMD_LINE;
my $RETURN;

$AVOID_UIDS="root|100|nobody|ntp|USER|daemon|postfix|vtunesag";
   open (PS_CMD, "-|") || exec 'ps -e -o user,pid';
   while (<PS_CMD>) {
     chomp;
     ($_userid, $tpid) = split (/\s+/, $_);

       if ( $_userid !~ m/^${AVOID_UIDS}/ && $tpid =~ /^[0-9]/ && -e "/proc/$tpid/oom_adj" ) {
          print "$CALL_SCPT $host: Found processes to set to zero oom_adj...\n" if $DEBUG;
          $CMD_LINE="echo 0 > /proc/$tpid/oom_adj";
          $RETURN=`$CMD_LINE`;
       }
       elsif ( $tpid =~ /^[0-9]/ && -e "/proc/$tpid/oom_adj" ) {
          print "$CALL_SCPT $host: Found processes to set to protect
oom_adj...\n" if $DEBUG;
          $CMD_LINE="echo -17 > /proc/$tpid/oom_adj";
          $RETURN=`$CMD_LINE`;
       }
   }
   close PS_CMD;

}

&ResetOomAdj();


Example A-2. cronentry: Sample cron Entry for oom_adj Script

*/2 *   * * *     /root/oom_adj.user.pl
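
This entry, installed in root's crontab, runs the adjustment script every two minutes; lengthen the interval if the process table on the node is large.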




Example A-3. prologue: Sample prologue Script

#!/bin/bash
##################################################################################
#
#  Version: 2.3.1 : Updated 8/12/09
#  Date: Oct 16, 2007
#  Author: Scott Shaw, sshaw@sgi.com
#
#  Script Name: PBS Pro Prologue Script
#  The purpose of the prologue script is to terminate leftover user processes and
#  release allocated IPC resources. The prologue consists of two scripts: the main
#  prologue script and a chk_node.pl script.  To minimize the time spent accessing
#  each node, the prologue script runs a parallel ssh across the set of nodes
#  listed in PBS_NODEFILE.  On large clusters (over 64 nodes) serial ssh access is
#  slow, so a flexible parallel ssh speeds up the clean-up of each node.  In some
#  cases a PBS job terminates normally but the MPI implementation does not
#  terminate the MPI processes, due to poor error-code handling or segmentation
#  faults within the MPI application, leaving behind user processes that still
#  consume system resources.
#  
#  When the prologue script is launched by the PBS MOM, it opens ssh sessions
#  that run the chk_node.pl script. The chk_node.pl script contains a series of
#  clean-up commands which are executed on each node listed in PBS_NODEFILE.
#
#  The prologue script executes under the root account.
#
#  This script needs to reside on each execution host/node
#  Location: /var/spool/PBS/mom_priv
#  File name: prologue
#  Permissions: 755
#  Owner: root
#  Group: root
#
#  ls output: ls -l /var/spool/PBS/mom_priv/prologue
#       -rwxr-xr-x 1 root root 2054 Sep  6 19:39 /var/spool/PBS/mom_priv/prologue
#
#  Modification of the prologalarm value may be necessary if network access to
#  each node is slow.  30 seconds may not be enough time to check 256 nodes in
#  a cluster.  prologalarm defines the maximum number of seconds the prologue
#  and epilogue may run before timing out. Default:
#  30. Integer. Example:
#  $prologalarm 30
#
##################################################################################


JOBID=$1
USERNAME=$2
GROUPNAME=$3
JOBNAME=$4
P_PID=$5
NCPUS=$6
CPU_PERCENT=$7
QUEUE=$8
TTY_TYPE=$9
UNKNOWN_ARG=${10}
VERSION="v2.3.1"

SSHOPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=6"

# If the cluster blade layout is not sequential, use a flat file.
NODES_FILE="/var/spool/PBS/aux/${JOBID}";

spawn ()
{
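        # Throttle the number of concurrent background jobs: if the count of
        # still-running jobs has reached the limit passed in $1, wait for them
        # to finish before forking more.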
        if [[ `jobs | grep  -v Done | wc -l` -ge $1 ]]; then
                wait
        fi
        shift
        $@  &
}

exec_cmd ()
{
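   # Run $CMDLINE on every unique node in the job's node file, at most
   # 25 concurrent ssh sessions at a time.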
   for HOSTNAME in $( cat ${NODES_FILE} | sort -u )
   do
      spawn 25 ssh ${SSHOPTS} ${HOSTNAME} $CMDLINE
   done
   wait
}

# main()
#Find PBS qstat command
if [ -f /usr/pbs/bin/qstat ]; then
   QSTAT=/usr/pbs/bin/qstat

elif [ -f /opt/pbs/default/bin/qstat ]; then
   QSTAT=/opt/pbs/default/bin/qstat

else
  echo "Epilogue Error: The qstat command could not be detected, exiting..."
  exit 1
fi


prefix_flag=`${QSTAT} -a ${JOBID} | grep "^[0-9]" |awk '{print $4}' | awk -F. '{print $1}'`
queue=`${QSTAT} -a ${JOBID} | grep "^[0-9]" |awk '{print $3}'`

   echo "Start Prologue ${VERSION} `date` "

   if [ $( /bin/uname -m ) = "x86_64" ]; then
         echo "Prefix passed: ${prefix_flag}"
         echo "destination queue: ${queue}"
         
         case $prefix_flag in
            TB) 
              # Enable turbo and do node cleanup
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} TB"
              exec_cmd
              ;;
            BP) 
              # Bypass the turbo setting and P/Elog cleanup
              echo "* * * * Bypassing the PBS Prologue and Epilogue scripts * * * *"
              ;;
            JT) 
              # Enable turbo but do not run the node cleanup p/elog scripts
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} JT"
              exec_cmd
              ;;
            NT) 
              # bypass turbo settings but run the node cleanup
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue} NT"
              exec_cmd
              ;;
             *)
              # disable turbo and run the node cleanup scripts
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Plog ${queue}"
              exec_cmd
          esac
   else
      echo "The prologue script is intended to run on x86_64 nodes not `uname -m`."
      echo "End Prologue ${VERSION} `date` "
      exit -1
   fi
   echo "End Prologue ${VERSION} `date` "


#Output the cluster details file
   if [ -f /var/spool/PBS/mom_priv/cluster_info.out ]; then
      cat /var/spool/PBS/mom_priv/cluster_info.out
   else
      echo "WARNING: The cluster info file does not exist. Contact hpc_support and report this warning."
   fi


Example A-4. epilogue: Sample epilogue Script

#!/bin/bash
##################################################################################
#
#  Version: 2.3.1 : Updated 8/12/09
#  Date: Oct 16, 2007
#  Author: Scott Shaw, sshaw@sgi.com
#
#  Script Name: PBS Pro Epilogue Script
#  The purpose of the epilogue script is to terminate leftover user processes and
#  release allocated IPC resources. The epilogue consists of two scripts: the main
#  epilogue script and a chk_node.pl script.  To minimize the time spent accessing
#  each node, the epilogue script runs a parallel ssh across the set of nodes
#  listed in PBS_NODEFILE.  On large clusters (over 64 nodes) serial ssh access is
#  slow, so a flexible parallel ssh speeds up the clean-up of each node.  In some
#  cases a PBS job terminates normally but the MPI implementation does not
#  terminate the MPI processes, due to poor error-code handling or segmentation
#  faults within the MPI application, leaving behind user processes that still
#  consume system resources.
#  
#  When the epilogue script is launched by the PBS MOM, it opens ssh sessions
#  that run the chk_node.pl script. The chk_node.pl script contains a series of
#  clean-up commands which are executed on each node listed in PBS_NODEFILE.
#
#  The epilogue script executes under the root account.
#
#  This script needs to reside on each execution host/node
#  Location: /var/spool/PBS/mom_priv
#  File name: epilogue
#  Permissions: 755
#  Owner: root
#  Group: root
#
#  ls output: ls -l /var/spool/PBS/mom_priv/epilogue
#       -rwxr-xr-x 1 root root 2054 Sep  6 19:39 /var/spool/PBS/mom_priv/epilogue
#
#  Modification of the prologalarm value may be necessary if network access to
#  each node is slow.  30 seconds may not be enough time to check 256 nodes in
#  a cluster.  prologalarm defines the maximum number of seconds the prologue
#  and epilogue may run before timing out. Default:
#  30. Integer. Example:
#  $prologalarm 30
#
##################################################################################


JOBID=$1
USERNAME=$2
GROUPNAME=$3
JOBNAME=$4
P_PID=$5
NCPUS=$6
CPU_PERCENT=$7
QUEUE=$8
TTY_TYPE=$9
UNKNOWN_ARG=${10}
VERSION="v2.3.1"

SSHOPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=6"

# If the cluster blade layout is not sequential, use a flat file.
NODES_FILE="/var/spool/PBS/aux/${JOBID}";

spawn ()
{
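        # Throttle the number of concurrent background jobs: if the count of
        # still-running jobs has reached the limit passed in $1, wait for them
        # to finish before forking more.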
        if [[ `jobs | grep  -v Done | wc -l` -ge $1 ]]; then
                wait
        fi
        shift
        $@  &
}

exec_cmd ()
{
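   # Run $CMDLINE on every unique node in the job's node file, at most
   # 25 concurrent ssh sessions at a time.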
   for HOSTNAME in $( cat ${NODES_FILE} | sort -u )
   do
      spawn 25 ssh ${SSHOPTS} ${HOSTNAME} $CMDLINE
   done
   wait
}

# main()
#Find PBS qstat command
if [ -f /usr/pbs/bin/qstat ]; then
   QSTAT=/usr/pbs/bin/qstat

elif [ -f /opt/pbs/default/bin/qstat ]; then
   QSTAT=/opt/pbs/default/bin/qstat

else
  echo "Epilogue Error: The qstat command could not be detected, exiting..."
  exit 1
fi
 



prefix_flag=`${QSTAT} -a ${JOBID} | grep "^[0-9]" |awk '{print $4}' | awk -F. '{print $1}'`
queue=`${QSTAT} -a ${JOBID} | grep "^[0-9]" |awk '{print $3}'`

   echo "Start Epilogue ${VERSION} `date` "
   if [ $( /bin/uname -m ) = "x86_64" ]; then
         echo "Prefix passed: ${prefix_flag}"
         echo "destination queue: ${queue}"

         case $prefix_flag in
            TB)
              # Enable turbo and do node cleanup
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset"
              exec_cmd
              ;;
            BP)
              # Bypass the turbo setting and P/Elog cleanup
              echo "* * * * Bypassing the PBS Prologue and Epilogue scripts * * * *"
              ;;
            JT)
              # Enable turbo but do not run the node cleanup p/elog scripts
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset JT"
              exec_cmd
              ;;
            NT)
              # bypass turbo settings but run the node cleanup
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog noreset NT"
              exec_cmd
              ;;
             *)
              # disable turbo and run the node cleanup scripts
              CMDLINE="/var/spool/PBS/mom_priv/chk_node.pl Elog reset"
              exec_cmd
          esac
   else
      echo "The epilogue script is intended to run on x86_64 nodes, not `uname -m`."
      echo "End Epilogue ${VERSION} `date` "
      exit 1
   fi
   echo "End Epilogue ${VERSION} `date` "


Example A-5. chk_node.pl.txt : Script Used by the prologue and epilogue Scripts

#!/usr/bin/perl 
#  Version: 2.3.1 : Updated 8/12/09
#  Orig Date: Oct 10, 2007
#  Author: Scott Shaw, sshaw@sgi.com
# 
#  This perl script is called by PBS Pro prologue and epilogue scripts when 
#  a user submits a job through PBS Pro.  The purpose of this script is to 
#  sanitize a range of nodes identified by the $PBS_NODEFILE list by
#  terminating old user processes, old ipc allocations, temp files,
#  and to flush the system buffer cache.
#
#  Changes:
#   2/1/08 sshaw@sgi.com
#          - Added a subroutine to clean-up /tmp directory
#          - changed system() to exec since it was corrupting memory
#          - declared all vars to be local to subroutine, before it was loosely defined
#          - added strict checking of perl script  
#   3/24/08 sshaw@sgi.com
#          - fixed debug conditional
#          - cleaned up the CleanUpProcesses procedure and added which processes
#            and user being terminated.
#          - Changed the killall to pkill due to userid > 8 chars
#   11/13/08 sshaw@sgi.com
#          - added a subroutine to clean-up /dev/shm since several users
#            use this location for temporary scratch space.
#   03/31/09 sshaw@sgi.com
#          - added subroutines to enable/disable Turbo mode on Intel series 5500 CPUs
#   04/22/09 sshaw@sgi.com
#          - added subroutines to speed step the core processor frequency to a lower freq
#   08/12/09 sshaw@sgi.com
#          - fixed minor issues with setting the frequency and fixed cpu freq to max speed


use strict;

use Sys::Hostname;
my $host = hostname();
my $DEBUG=1; # 0=turn off, 1=turn on
my $CALL_SCPT=$ARGV[0];
my $queue_destination=$ARGV[1];
my $prefix_option=$ARGV[2];
my $set_freq=0;

#####
# The following lines are added for Turbo/SMT mode starting with Intel 5500 series CPUs
my $rdmsr = "/var/spool/PBS/mom_priv/rdmsr";
my $wrmsr = "/var/spool/PBS/mom_priv/wrmsr";
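# MSR 0x199 is IA32_PERF_CTL; bit 32 is the turbo-mode disengage bit.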
my $msr = "0x199";
my $tbit = 1 <<  32;


# Several MPI implementations or MPI applications use IPC shared memory.  When
# a MPI application abnormally terminates it leaves behind allocated resources.
# this subroutine will remove any IPC resources allocated for the user's job.
sub CleanUpIPC_Table {
my $tkey;
my $tshmid;
my $towner;
my $tperms;
my $tbytes;
my $tnattch;
my $tstatus;
my $CMD_LINE;
my $RETURN;

   open(IPC_SHARMEM, "-|") || exec 'ipcs -m';
   while (<IPC_SHARMEM>) {
      chomp;
      ($tkey, $tshmid, $towner, $tperms, $tbytes, $tnattch, $tstatus) = split (/\s+/, $_);
        if ( $tkey =~ /^[0-9]/ ) {
           if ( $towner !~ m/root|^ / ) {
           print "$CALL_SCPT $host: Found IPC_SHR_MEM allocation: $tshmid $towner, terminating...\n" if $DEBUG;
            $CMD_LINE="ipcrm -m $tshmid";
            $RETURN=`$CMD_LINE`;
           }
        }
   }
   close IPC_SHARMEM;
}

# This subroutine will parse the process list and terminate any user processes or logins 
# into the node(s)
sub CleanUpProcesses {
my $AVOID_UIDS;
my $_userid;
my $tpid;
my $tppid;
my $tcpu;
my $tstime;
my $ptty;
my $ttime;
my $tcmd;
my @TERM_USER;
my @TEMP;
my $USER;
my $CMD_LINE;
my $RETURN;

$AVOID_UIDS="root|100|101|nobody|bin|ntp|UID|daemon|postfix|vtunesag";
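# The pattern above matches the ps -ef header line ("UID") as well as system
# accounts whose processes must survive the clean-up.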
   open (PS_CMD, "-|") || exec 'ps -ef';
   while (<PS_CMD>) {
     chomp;
     ($_userid, $tpid, $tppid, $tcpu, $tstime, $ptty, $ttime, $tcmd) = split (/\s+/, $_);

       if ( $_userid !~ m/^${AVOID_UIDS}/ ) {
          if ( $_userid =~ /^[0-9]/ ) {
             $_userid=`ypcat passwd | egrep ${_userid} | cut -d \":\" -f 1`;
             chomp $_userid;
          }
          print "$CALL_SCPT $host: Found leftover processes $tcmd from $_userid terminating...\n" if $DEBUG;
          $CMD_LINE="pkill -9 -u $_userid";   # Switched to pkill due to length of usernames.
          $RETURN=`$CMD_LINE`;
       }
   }
   close PS_CMD;
   system("/root/oom_adj.user.pl");
}
# This subroutine will remove any temporary files created by MPI application under /tmp.
sub CleanUpTmp {
my $_filename;
my @TEMP;
my @TERM_FILE;
my $CMD_LINE;
my $RETURN;
my $_nofiles;
my $FILE;


   open (LS_CMD, "-|") || exec 'ls /tmp';
   while (<LS_CMD>) {
      chomp;
      ($_filename) = split (/\s+/, $_);
      if ( $_filename =~ m/^mpd/ ) {
         @TEMP=$_filename;
         push @TERM_FILE, $TEMP[0];
      }
      elsif ( $_filename =~ m/^ib_pool/ ) {
         @TEMP=$_filename;
         push @TERM_FILE, $TEMP[0];
      }
      elsif ( $_filename =~ m/^ib_shmem/ ) {
         @TEMP=$_filename;
         push @TERM_FILE, $TEMP[0];
      }
   }
   close LS_CMD;

   foreach $FILE (@TERM_FILE) {
      $CMD_LINE="rm -f /tmp/${FILE}";
      $RETURN=`$CMD_LINE`;
   }

   $_nofiles = scalar @TERM_FILE;
      if ($_nofiles != 0) {
          print "$CALL_SCPT $host: Found $_nofiles MPI temp files under /tmp. Removing...\n" if $DEBUG;
      }
}

# Flush the Linux I/O buffer cache and the slab cache using the bcfree command.
sub FreeBufferCache {
my $CMD_LINE;
my $RETURN;
my $BCFREE;
my $BCFREE_OPTS;

$BCFREE="/usr/bin/bcfree";
$BCFREE_OPTS="-a -s";

   if (-e "${BCFREE}") {
      $CMD_LINE="${BCFREE} ${BCFREE_OPTS}";
      $RETURN=`$CMD_LINE`;
   }
}

# This subroutine will remove any temporary files created by MPI application under /dev/shm.
sub CleanUpshm {
my $_filename;
my @TEMP;
my @TERM_FILE;
my $CMD_LINE;
my $RETURN;
my $_nofiles;
my $FILE;


   open (LS_CMD, "-|") || exec 'ls /dev/shm';
   while (<LS_CMD>) {
      chomp;
      ($_filename) = split (/\s+/, $_);
      @TEMP=$_filename;
      push @TERM_FILE, $TEMP[0];
   }
   close LS_CMD;

   foreach $FILE (@TERM_FILE) {
      if (${FILE} !~ m/sysconfig/) {
         $CMD_LINE="rm -rf /dev/shm/${FILE}";
         $RETURN=`$CMD_LINE`;
         print "${RETURN}" if $DEBUG;
         print "$CALL_SCPT $host: Found ${FILE} dir/file under /dev/shm. Removing it...\n" if $DEBUG;
      }
   }

}

sub chk_msr_state {
# Hyperthreading Assumption, if the first core has the bit set to enable/disable
# then it is assumed all other cores within the node have the same setting.

my $msr_lsmod=`lsmod | grep -c msr`;    # 0=not loaded, 1=msr loaded

   if ( $msr_lsmod == 0 ) {
      print "Loading MSR Kernel Modules...\n";
      `modprobe msr`;  # we need the msr kernel modules loaded to read the msr values
       sleep(1);  # give time for the msr modules to load
   }
}

sub enable_turbo_mode {
my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;
my $i;
my $val;
my $nval;
        chk_msr_state();
        print "${host}: Enabling turbo mode...\n";
        chomp($val = `$rdmsr -p 0  $msr`);
        $val = hex("100000017");
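        # 0x100000017 has the disengage bit (bit 32) set; the XOR below
        # clears it, engaging turbo mode.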
        $nval = $val ^ $tbit;
        printf("${host}: Changing msr $msr on all cores from 0x%lx to 0x%lx\n", $val, $nval);
        for ($i = 0; $i < $ncpus; $i++) {
           `$wrmsr -p $i $msr $nval`;
        }
        load_system_services();
}

sub disable_turbo_mode {
my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;
my $i;
my $val;
my $nval;
        chk_msr_state();
        print "${host}: Disabling turbo mode...\n";
        chomp($val = `$rdmsr -p 0  $msr`);
        $val = hex(16);
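        # hex(16) yields 0x16 with bit 32 clear; the XOR below sets
        # bit 32, disengaging turbo mode.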
        #$val = hex($val);
        $nval = $val ^ $tbit;
        printf("${host}: Changing msr $msr on all cores from 0x%lx to 0x%lx\n", $val, $nval);
        for ($i = 0; $i < $ncpus; $i++) {
           `$wrmsr -p $i $msr $nval`;
        }
}


sub load_system_services {
my $powersave_loaded=`ps -ef | grep -v grep | grep -c power`;

   if ($powersave_loaded == 0 ) {
      print "${host}: Loading system services...\n";
      system("(/etc/init.d/acpid start;/etc/init.d/powersaved start)&> /dev/null");
      sleep(1);
      system("/usr/bin/powersave -f");
   }
   else {
      print "Powersaved already loaded.\n";
   }
}

sub unload_system_services {
   print "${host}: Unloading system services...\n";
   system("(/etc/init.d/acpid stop;/etc/init.d/powersaved stop)&> /dev/null");
}

sub run_cleanup {
    &CleanUpshm();
    &CleanUpTmp();
    &CleanUpIPC_Table();
    &CleanUpProcesses();
    &CleanUpProcesses();
}

sub set_processor_speed {
my $freq=shift;
my $ncpus = `cat /proc/cpuinfo | grep processor | wc -l`;
my $i;
my $file;
     load_system_services();
     $freq = $freq * 1000;
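     # $freq arrives in MHz (e.g. 2934); the sysfs scaling files below expect kHz.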
     printf("${host}: Setting Proc Core speed to: %.3f GHz\n",($freq/1000000)) ;
     for ($i = 0; $i < $ncpus; $i++) {
       $file = "/sys/devices/system/cpu/cpu" . $i . "/cpufreq/scaling_min_freq";
       open FILE1, ">", $file or die $!;
          print FILE1 "$freq\n";
       close FILE1;

       $file = "/sys/devices/system/cpu/cpu" . $i . "/cpufreq/scaling_max_freq";
       open FILE2, ">", $file or die $!;
          print FILE2 "$freq\n";
       close FILE2;
     }
}


# 
#print "$prefix_option\n";
#print "$queue_destination\n";
#
#   if ( $queue_destination =~ /^f/ ) {
#      my $b=0;
#      ($a,$set_freq) = split (/f/, $queue_destination);
#      set_processor_speed($set_freq);
#   }


# Don't run on systems with earlier than Nehalem processors
# Based on the prefix_option set turbo mode accordingly and run node cleanup routines.
   #if( $prefix_option =~ m/TB/ ){
        #enable_turbo_mode(); 
        #run_cleanup();
   #}
   #elsif ( $prefix_option =~ m/JT/ ) {
      #print "* * * * ENABLE TURBO and bypass PBS Prologue and Epilogue scripts * * * *\n";
      #enable_turbo_mode();
   #}
   #elsif ( $prefix_option =~ m/NT/ ) {
      #print "* * * * Bypassing the Turbo checks and run just node clean-up * * * *\n";
      #run_cleanup();
   #}
   #elsif ( $queue_destination =~ /^f/ ) {
      #my $b=0;
      #($a,$set_freq) = split (/f/, $queue_destination);
      #set_processor_speed($set_freq);
   #}
   #elsif ( $queue_destination =~ /^reset/ ) {
         #set_processor_speed(2934);
         #disable_turbo_mode();
         #unload_system_services();
         #run_cleanup();
     #}
   #else {
      #disable_turbo_mode();
      #unload_system_services();
      #run_cleanup();
   #}

run_cleanup();