When a cluster is shut down correctly, it is very easy to restart. Run 'qstat -f'; if all the nodes are responding, move to the next step. If any nodes are flagged 'adu' or similar, you will have to PXE-boot those nodes (a quick check for flagged nodes is sketched below).
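Healthy queue instances leave the state column of 'qstat -f' empty, so printing only the lines that carry a sixth field surfaces the flagged nodes. The one-liner below assumes the standard six-column Grid Engine listing; it is a sketch, not part of the original procedure:

# print the queue instance and state for any queue reporting a state flag (e.g. au, adu, d)
qstat -f | awk 'NF == 6 && $6 != "states" { print $1, "state:", $6 }'

File name: cluster_test.sh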
#!/bin/bash
# This script tests the Loren cluster after a shutdown.
# To run the script type 'bash cluster_test.sh' (bash, not sh, is required
# for the [[ ... =~ ]] test used below).
VASP=vasp_std
MPIRUN=mpiexec.hydra
#create a directory and input files
mkdir -p ~/test
cd ~/test || exit 1
cat > INCAR <<!
SYSTEM = O atom in a box
ISMEAR = 0
!
cat > POSCAR <<!
O atom in a box
1.0
8.0 0.0 0.0
0.0 8.0 0.0
0.0 0.0 8.0
1
cart
0 0 0
!
cat > KPOINTS <<!
Gamma-point only
1
rec
0 0 0 1
!
cp /share/apps/vasp/POTENTIALS/GGA-PBE/O/POTCAR .
#set up the test for the different queues
mpi=16
i=0   # all Loren test nodes are on rack 0; node numbers are listed explicitly below
# test each queue
for j in 1 2 5 11 13 14 15 16 17 18 19 21 22 23 24 25 27 28 29 31
do
mkdir ~/test/$i-$j
cp INCAR KPOINTS POSCAR POTCAR $i-$j
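# Generate a per-node SGE job script. The here-document below is unquoted,
# so $mpi, $MPIRUN and $VASP are filled in now, when qvasp is written.
# '-pe mpi' requests the slots and '-q all.q@compute-...' pins the job to a
# single node, so one test job lands on every compute node in the list.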
cat > $i-$j/qvasp <<!
#!/bin/bash
#$ -N test_$i-$j
#$ -cwd
#$ -pe mpi $mpi
#$ -S /bin/bash
#$ -q all.q@compute-$i-$j.local
#$ -e vasp.err
#$ -o vasp.out
module purge
module load compilers/intel_16.0.3 apps/vasp_5.4.1
$MPIRUN $VASP > job.log
!
cd $i-$j
qsub qvasp
cd ../
done
# Have user check if jobs are finished
user_input=0
until [ "$user_input" -eq 1 ]
do
qstat -f
read -r -p "Have all test jobs finished? [Y/N] " response
if [[ "$response" =~ ^([yY][eE][sS]|[yY])+$ ]]
then
user_input=1
fi
done
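# Note: instead of prompting, you could poll the queue automatically;
# a sketch, assuming every test job name starts with 'test_':
#   while qstat | grep -q 'test_'; do sleep 30; done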
# Once all jobs have finished print out the error files
echo "Print out VASP error files:"
i=0
# test each queue
for j in 1 2 5 11 13 14 15 16 17 18 19 21 22 23 24 25 27 28 29 31
do
echo "compute-$i-$j.local"
tail -n 2 "$i-$j/vasp.err"
done
printf "If the only error is 'cannot remove tmp directory' the simulation was successful.\n"
printf "If vasp.err cannot be found, the job never ran; it is likely the node is not functioning and you will need to do a hard reboot.\n"
printf "If it cannot find the Intel compilers or the vasp_std binary, you need to copy the modulefiles and compilers to the node (see the UF-FLAMES 'admin' site).\n"
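A hard reboot is normally driven through the node's baseboard management controller. A minimal ipmitool sketch; the BMC hostname and credentials below are placeholders, not the cluster's actual values:

# set the next boot to PXE (network install), then power-cycle the node
ipmitool -I lanplus -H compute-0-5-ipmi -U ADMIN -P password chassis bootdev pxe
ipmitool -I lanplus -H compute-0-5-ipmi -U ADMIN -P password chassis power cycle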
File name: cluster_test.sh (Hermes cluster)
#!/bin/bash
# This script tests the Hermes cluster after a shutdown.
# To run the script type 'bash cluster_test.sh'.
#create a directory and input files
mkdir -p ~/test
cd ~/test || exit 1
cat > INCAR <<!
SYSTEM = O atom in a box
ISMEAR = 0
!
cat > POSCAR <<!
O atom in a box
1.0
8.0 0.0 0.0
0.0 8.0 0.0
0.0 0.0 8.0
1
cart
0 0 0
!
cat > KPOINTS <<!
Gamma-point only
1
rec
0 0 0 1
!
cp /opt/VASP-resources/Potentials/GGA/O/POTCAR .
#set up the test for the different queues
mpi=8
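# Queue layout encoded in the if/elif chains below:
#   rack 0: all2.q,   8 nodes, 24 MPI slots per job
#   rack 1: all.q,   24 nodes,  8 MPI slots (node 11 gets 4; nodes above 22 use single.q)
#   rack 2: skipped
#   rack 3: eight.q, 20 nodes,  8 MPI slots per job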
for i in {0..3}
do
if [ $i -eq 0 ]
then
nodes=8
name=all2
elif [ $i -eq 1 ]
then
nodes=24
name=all
elif [ $i -eq 3 ]
then
nodes=20
name=eight
else
continue
fi
# test each queue
for ((j=1;j<=nodes;j++));
do
mkdir ~/test/$i-$j
cp INCAR KPOINTS POSCAR POTCAR $i-$j
if [ $i -eq 1 ] && [ $j -eq 11 ]
then
mpi=4
elif [ $i -eq 1 ] && [ $j -gt 22 ]
then
name=single
mpi=8
elif [ $i -eq 0 ]
then
mpi=24
else
mpi=8
fi
cat > $i-$j/qvasp <<!
#!/bin/bash
#$ -N test_$i-$j
#$ -cwd
#$ -pe mpi $mpi
#$ -S /bin/bash
#$ -q $name.q@compute-$i-$j.local
#$ -e vasp.err
#$ -o vasp.out
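# Hermes has no environment modules; point straight at the Intel MKL runtime
# and the user-built VASP binary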
export LD_LIBRARY_PATH=/opt/intel/composer_xe_2011_sp1.8.273/mkl/lib/intel64
/opt/intel/openmpi-1.4.4/bin/mpirun /home/michele/bin/vasp5.3.5_openmpi_1.4.4 > job.log
!
cd $i-$j
qsub qvasp
cd ../
done
done
# Have user check if jobs are finished
user_input=0
until [ "$user_input" -eq 1 ]
do
qstat -f
read -r -p "Have all test jobs finished? [Y/N] " response
if [[ "$response" =~ ^([yY][eE][sS]|[yY])+$ ]]
then
user_input=1
fi
done
# Once all jobs have finished print out the error files
echo "Print out VASP error files:"
for i in {0..3}
do
if [ $i -eq 0 ]
then
nodes=8
name=all2
elif [ $i -eq 1 ]
then
nodes=24
name=all
elif [ $i -eq 3 ]
then
nodes=20
name=eight
else
continue
fi
for ((j=1;j<=nodes;j++));
do
echo "$name.q@compute-$i-$j.local"
cat "$i-$j/vasp.err"
done
done
printf "If the error file reads 'cannot remove tmp directory' the simulation was successful.\n"
printf "If vasp.err cannot be found, the node is not functioning and you will need to do a hard reboot.\n"
printf "If it cannot find the Intel compilers, you need to fix a missing /opt/intel directory (see the UF FLAMES 'admin' guide).\n"
5. If any jobs failed, check the error messages. If all nodes are working but you cannot run the VASP test jobs, you may need to copy the compilers and modulefiles to each node, as sketched below.
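Pushing the toolchain out from the head node can be scripted; the source paths below are assumptions, so check where your head node actually keeps the Intel installation and the modulefiles before copying:

# copy the Intel compilers and the modulefiles to one node; repeat per node,
# or wrap in a loop over the node lists used in the test scripts
node=compute-0-5.local
rsync -a /opt/intel/ ${node}:/opt/intel/
rsync -a /usr/share/Modules/modulefiles/ ${node}:/usr/share/Modules/modulefiles/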