When a cluster is shut down correctly, it is very easy to restart.
Run 'qstat -f'; if all the nodes are responding, move to the next step. If any nodes are flagged 'adu' or similar, you will have to PXE-boot those nodes.
File name: cluster_test.sh
#!/bin/sh# This script tests the Loren cluster after a shutdown# To run the script type 'sh cluster_test.sh'.VASP=vasp_stdMPIRUN=mpiexec.hydra#create a directory and input filesmkdir ~/testcd ~/testcat > INCAR <<!SYSTEM = O atom in a boxISMEAR = 0!cat > POSCAR <<!O atom in a box1.0 8.0 0.0 0.0 0.0 8.0 0.0 0.0 0.0 8.0 1 cart 0 0 0!cat > KPOINTS <<!Gamma-point only1 rec 0 0 0 1 !cp /share/apps/vasp/POTENTIALS/GGA-PBE/O/POTCAR .#set up the test for the different queuesmpi=16nodes=32i=0# test each queuefor j in 1 2 5 11 13 14 15 16 17 18 19 21 22 23 24 25 27 28 29 31 do mkdir ~/test/$i-$j cp INCAR KPOINTS POSCAR POTCAR $i-$j cat > $i-$j/qvasp <<!#!/bin/bash#$ -N test_$i-$j#$ -cwd#$ -pe mpi $mpi#$ -S /bin/bash#$ -q all.q@compute-$i-$j.local#$ -e vasp.err#$ -o vasp.outmodule purgemodule load compilers/intel_16.0.3 apps/vasp_5.4.1$MPIRUN $VASP > job.log! cd $i-$j qsub qvasp cd ../done# Have user check if jobs are finisheduser_input=0until [ $user_input -eq "1" ] do qstat -f read -r -p "Have all test jobs finished? [Y/N] " response if [[ "$response" =~ ^([yY][eE][sS]|[yY])+$ ]] then user_input=1 fi done# Once all jobs have finished print out the error filesecho "Print out VASP error files:"i=0# test each queuefor j in 1 2 5 11 13 14 15 16 17 18 19 21 22 23 24 25 27 28 29 31 do echo "compute-$i-$j.local" echo "$(<$i-$j/vasp.err)" | tail -2 doneprintf "If the only error is 'cannot remove tmp directory' the simulation was successful. \nIf the error file reads 'cannot find vasp.err' it is likely the node is not functioning and you will need to do a hard reboot. 
\nIf it cannot find the intel compliers or vasp_std binary you need to copy the modulefiles and compilers to the node (see the UF-FLAMES 'admin' site)."#!/bin/sh# This script tests the Hermes cluster after a shutdown# To run the script type 'sh cluster_test.sh'.#create a directory and input filesmkdir ~/testcd ~/testcat > INCAR <<!SYSTEM = O atom in a boxISMEAR = 0!cat > POSCAR <<!O atom in a box1.0 8.0 0.0 0.0 0.0 8.0 0.0 0.0 0.0 8.0 1 cart 0 0 0!cat > KPOINTS <<!Gamma-point only1 rec 0 0 0 1 !cp /opt/VASP-resources/Potentials/GGA/O/POTCAR .#set up the test for the different queuesmpi=8for i in {0..3} ;do if [ $i -eq 0 ] then nodes=8 name=all2 elif [ $i -eq 1 ] then nodes=24 name=all elif [ $i -eq 3 ] then nodes=20 name=eight else continue fi# test each queue for ((j=1;j<=nodes;j++)); do mkdir ~/test/$i-$j cp INCAR KPOINTS POSCAR POTCAR $i-$j if [ $i -eq 1 ] && [ $j -eq 11 ] then mpi=4 elif [ $i -eq 1 ] && [ $j -gt 22 ] then name=single mpi=8 elif [ $i -eq 0 ] then mpi=24 else mpi=8 fi cat > $i-$j/qvasp <<!#!/bin/bash#$ -N test_$i-$j#$ -cwd#$ -pe mpi $mpi#$ -S /bin/bash #$ -q $name.q@compute-$i-$j.local#$ -e vasp.err#$ -o vasp.out export LD_LIBRARY_PATH=/opt/intel/composer_xe_2011_sp1.8.273/mkl/lib/intel64 /opt/intel/openmpi-1.4.4/bin/mpirun /home/michele/bin/vasp5.3.5_openmpi_1.4.4 > job.log! cd $i-$j qsub qvasp cd ../ donedone# Have user check if jobs are finisheduser_input=0until [ $user_input -eq "1" ] do qstat -f read -r -p "Have all test jobs finished? 
[Y/N] " response if [[ "$response" =~ ^([yY][eE][sS]|[yY])+$ ]] then user_input=1 fi done# Once all jobs have finished print out the error filesecho "Print out VASP error files:"for i in {0..3} ;do if [ $i -eq 0 ] then nodes=8 name=all2 elif [ $i -eq 1 ] then nodes=24 name=all elif [ $i -eq 3 ] then nodes=20 name=eight else continue fi for ((j=1;j<=nodes;j++)); do echo "$name.q@compute-$i-$j.local" echo "$(<$i-$j/vasp.err)" donedoneprintf "If the error file reads 'cannot remove tmp directory' the simulation was successful. \nIf the error file reads 'cannot find vasp.err' the node is not functioning and you will need to do a hard reboot. \nIf it cannot find the intel compliers you need to fix a missing opt/intel directory (see the UF FLAMES 'admin' guide)."5. If any jobs failed check the error messages. If all nodes are working but you can't run the VASP test jobs you may need to copy the compilers and modulefiles to each node.