Commit f9b1e397 authored by Christian Meesters

- outmerging for out files

- improved UI
- minor debugging
parent 5ed2df07
...@@ -43,15 +43,11 @@ module purge ...@@ -43,15 +43,11 @@ module purge
# load the most current version of GNU parallel # load the most current version of GNU parallel
module load tools/parallel module load tools/parallel
# load the most current version of BLAST +
#module load bio/BLAST+ # do not rely on most recent version
module load bio/BLAST+/2.9.0-foss-2018a
module load lang/Python/3.6.4-foss-2018a module load lang/Python/3.6.4-foss-2018a
### setup variable for THIS script; giving absolute path if necessary ### setup variable for THIS script; giving absolute path if necessary
SCRIPT="$0" SCRIPT="$0"
SCRIPT_VERSION="0.4" SCRIPT_VERSION="0.5"
# TODO: delete the following 3 functions, once sbcast is working # TODO: delete the following 3 functions, once sbcast is working
function queue { function queue {
...@@ -102,6 +98,10 @@ error() { ...@@ -102,6 +98,10 @@ error() {
(>&2 echo "ERROR: $1") (>&2 echo "ERROR: $1")
} }
# Emit a warning on stderr.
# Arguments: $1 - message text; it is printed with a "WARNING: " prefix.
# Outputs:   nothing on stdout, one line on stderr.
warning() {
  printf '%s\n' "WARNING: $1" >&2
}
# THE DEFAULTS INITIALIZATION - POSITIONALS # THE DEFAULTS INITIALIZATION - POSITIONALS
_positionals=() _positionals=()
_arg_leftovers=() _arg_leftovers=()
...@@ -112,10 +112,11 @@ _arg_queue=nodeshort ...@@ -112,10 +112,11 @@ _arg_queue=nodeshort
_arg_assoc=$(sacct -nu $USER -o Account | tail -n1) _arg_assoc=$(sacct -nu $USER -o Account | tail -n1)
declare -i _arg_nodes=1 declare -i _arg_nodes=1
_arg_reservation='' _arg_reservation=''
declare _arg_mem=1G declare _memory_request=115500M
declare _arg_mem=0
declare -i _arg_blast_threads=1 declare -i _arg_blast_threads=1
_arg_blast_params='' _arg_blast_params=''
declare -i _arg_splitup_per_queryfile=20 declare -i _arg_splitup_per_queryfile=0
declare _arg_ramdisk=40G declare _arg_ramdisk=40G
_arg_blastdir='.' _arg_blastdir='.'
_arg_executable='blastx' _arg_executable='blastx'
...@@ -126,26 +127,26 @@ print_help () ...@@ -126,26 +127,26 @@ print_help ()
{ {
echo "This script's help msg" echo "This script's help msg"
printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-s|--splitup <arg>] [-N|--nodes <arg>] [--executable <arg>] [-m|--mem <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n" printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-s|--splitup <arg>] [-N|--nodes <arg>] [--executable <arg>] [-m|--mem <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
printf 'HINT: The FASTA and DATABASE items need to be full paths to files.' printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n'
printf "\t%s\n" "<FASTA>: path to the query FASTA file" printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "<FASTA>" "path to the query FASTA file"
printf "\t%s\n" "<DATABASE>: path to the database file" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "<DATABASE>" "path to the database file"
printf "\t%s\n" "-l,--runlimit: runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-l,--runlimit" "runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)"
printf "\t%s\n" "-p,--partition: queue (default is nodeshort)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-p,--partition" "queue (default is nodeshort)"
printf "\t%s\n" "-A,--account: queue (default is the last submit account; an error is triggered if none specified nor can be deduced)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-A,--account" "SLURM account (default is the last submit account; an error is triggered if none specified nor can be deduced)"
printf "\t%s\n" "-N,--nodes: number of nodes (1 is the default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes (1 is the default)"
printf "\t%s\n" "--reservation: reservation to use (none is the default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use (none is the default)"
printf "\t%s\n" "--time: time in minutes (300 is the default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes (300 is the default)"
printf "\t%s\n" "-m,--mem: memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-m,--mem" "memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn, omit the unit for submitting)"
printf "\t%s\n" "-r,--ramdisk: ramdisk size in units of GiB (default is 40 GiB)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-r,--ramdisk" "ramdisk size in units of GiB (default is 40 GiB)"
printf "\t%s\n" "-t,--threads: blast threads (default is 1)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)"
printf "\t%s\n" "--blastparams: blast parameters (default is -outfmt 5 (for xml output))" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for blank tabulated output))"
printf "\t%s\n" "-s,--splitup: No. of FASTA sequences per query file (default is 20)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-s,--splitup" "No. of FASTA sequences per query file (default is 20)"
printf "\t%s\n" "--blastdir: output directory (default is composition of input names)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)"
printf "\t%s\n" "--executable: choose executable (currently only from NCBI-BLAST, default: blastx)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)"
printf "\t%s\n" "--compress: if set, the output files will be merged and compressed (time consuming!, defaultt: off)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, defaultt: off)"
printf "\t%s\n" "--test,--no-test: dry run, testing only (off by default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--test,--no-test" "dry run, testing only (off by default)"
printf "\t%s\n" "--credits,--version: Prints credits and a brief version history and exits" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--credits,--version" "Prints credits and a brief version history and exits"
printf "\t%s\n" "-h,--help: Prints help" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-h,--help" "Prints help"
echo echo
echo -e "\e[3mWARNINGS:\e[0m" echo -e "\e[3mWARNINGS:\e[0m"
echo -e "\e[3m- BLAST parameters:\e[0m" echo -e "\e[3m- BLAST parameters:\e[0m"
...@@ -200,6 +201,12 @@ credits() ...@@ -200,6 +201,12 @@ credits()
echo "- v0.3.2 -- 16. Jan. 2019 -- hot fix for new ramdisk and slurmstepd support" echo "- v0.3.2 -- 16. Jan. 2019 -- hot fix for new ramdisk and slurmstepd support"
echo "- v0.4 -- 06. Mar. 2019 -- refactored version:" echo "- v0.4 -- 06. Mar. 2019 -- refactored version:"
echo " - executables now pluggable" echo " - executables now pluggable"
echo "- v0.5 -- 17. Aug. 2019 -- fix: parser did not work for '--mem'-arg properly"
echo " update: - clearer UI"
echo " - better default memory settings"
echo " - faster stage-in for reference data"
echo " - automerge for -outfmt=6"
echo " - -outfmt=6 is now the default"
echo echo
echo "Current version is: $SCRIPT_VERSION" echo "Current version is: $SCRIPT_VERSION"
echo echo
...@@ -423,11 +430,16 @@ FASTA=$_arg_fasta ...@@ -423,11 +430,16 @@ FASTA=$_arg_fasta
DATABASE=$_arg_database DATABASE=$_arg_database
### check if query & database exist ### check if query & database exist
if [ ! -e "$FASTA" ]; then if [[ $_arg_test == "off" ]] && [ ! -e "$FASTA" ]; then
error "FASTA input was: '$FASTA' - no such file!" error "FASTA input was: '$FASTA' - no such file!"
exit 1 exit 1
fi fi
# Unless this is a dry run (--test), the database argument has to be an
# existing directory; otherwise report the problem and stop.
if [[ "$_arg_test" == "off" && ! -d "$DATABASE" ]]; then
  error "DATABASE input was: '$DATABASE' - no such directory!"
  exit 1
fi
#TODO: differentiate between blastn,x,p -- for now, all are equal #TODO: differentiate between blastn,x,p -- for now, all are equal
if [ "blastx" = "${_arg_executable,,}" ]; then if [ "blastx" = "${_arg_executable,,}" ]; then
executable="blastx" executable="blastx"
...@@ -471,9 +483,26 @@ DB=${DATABASEID%.*} ...@@ -471,9 +483,26 @@ DB=${DATABASEID%.*}
JOBTAG="BLAST_${FA}_VS_${DB}" JOBTAG="BLAST_${FA}_VS_${DB}"
# how many entries are there in the FASTA file?
echo "Checking input file"

#######################################
# Count the records of a FASTA file, i.e. the lines starting with '>'.
# Arguments: $1 - path to the FASTA file
# Outputs:   the number of records on stdout; 0 if the file has no
#            records or cannot be read (grep's own message goes to stderr)
#######################################
_count_fasta_entries() {
  local count
  # grep -c exits non-zero both for "no match" (prints 0) and for an
  # unreadable file (prints nothing); normalise both cases to 0.
  count=$(grep -c '^>' -- "$1") || count=${count:-0}
  printf '%s\n' "${count:-0}"
}

nentries=$(_count_fasta_entries "$FASTA")

# we try to set the split number to a value, which ensures an output of
# ~ 10.000 split files
if (( ${_arg_splitup_per_queryfile:-0} != 0 )); then # the user thinks differently?
  nsplits=$(( nentries / _arg_splitup_per_queryfile ))
  if (( nsplits > 50000 )); then
    error "There would be more than '$nsplits' files in scratch."
    exit 1
  elif (( nsplits > 15000 )); then
    warning "There will be '$nsplits' files in scratch."
  fi
else # infer the value
  _arg_splitup_per_queryfile=$(( nentries / 10000 ))
  # integer division yields 0 for inputs with fewer than 10.000 entries;
  # a split size of 0 sequences per file is meaningless, so use at least 1
  if (( _arg_splitup_per_queryfile < 1 )); then
    _arg_splitup_per_queryfile=1
  fi
fi
# default values, see: # default values, see:
# https://www.ncbi.nlm.nih.gov/books/NBK279675/ # https://www.ncbi.nlm.nih.gov/books/NBK279675/
DEFAULT_BLASTPARAMS='-outfmt 5' DEFAULT_BLASTPARAMS='-outfmt 6'
# sanity check: '-outfmt' in blast parameters? # sanity check: '-outfmt' in blast parameters?
if [[ "$_arg_blastparams" =~ "outfmt" ]]; then if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
BLASTPARAMS=$_arg_blastparams BLASTPARAMS=$_arg_blastparams
...@@ -483,8 +512,10 @@ fi ...@@ -483,8 +512,10 @@ fi
# test whether the output is xml or not # test whether the output is xml or not
if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then
XMLOUT=1 XMLOUT=1
else OUTOUT=0
elif [[ '-outfmt 6' =~ "$BLASTPARAMS" ]]; then
XMLOUT=0 XMLOUT=0
OUTOUT=1
fi fi
# TODO: port to M2 # TODO: port to M2
...@@ -550,15 +581,52 @@ fi ...@@ -550,15 +581,52 @@ fi
### which is the reference directory size? ### which is the reference directory size?
_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M _arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M
if [[ ! $SCRIPT == /* ]]; then if [[ ! $SCRIPT == /* ]]; then
SCRIPT="$PWD/$SCRIPT"; SCRIPT="$PWD/$SCRIPT";
fi fi
# which cluster are we on?
cluster=$(sacctmgr show cluster -p | tail -n1 | cut -f1 -d '|')

# memory sizes (in MB, given without unit) a user may select via -m/--mem
allowed_mem_settings="115500 242500 497500"

#######################################
# Work out the per-node memory request that is handed to sbatch.
# Globals:   allowed_mem_settings (read)
# Arguments: $1 - cluster name as reported by sacctmgr
#            $2 - user-selected memory in MB (0 or empty: nothing selected);
#                 a trailing 'M' is tolerated
#            $3 - name of the BLAST executable (compared case-insensitively)
# Outputs:   the memory request including the 'M' unit on stdout
# Returns:   0 on success, 1 if the selection is not one of the allowed sizes
#######################################
_resolve_memory_request() {
  local cluster_name=$1
  local requested=${2:-0}
  local exe=${3,,}

  requested=${requested%[Mm]}
  # only plain numbers can be compared against the allowed sizes
  [[ "$requested" =~ ^[0-9]+$ ]] || return 1

  if (( 10#$requested != 0 )); then # user tries to select a non-default memory
    if [[ " $allowed_mem_settings " != *" $requested "* ]]; then
      return 1
    fi
    printf '%sM\n' "$requested"
    return 0
  fi

  # no selection: set a default memory, depending on cluster and executable
  if [[ "$cluster_name" == "mogon" ]]; then # Mogon I
    if [[ "$exe" == "blastn" ]]; then
      printf '%s\n' "242500M"
    else
      printf '%s\n' "115500M"
    fi
  else
    if [[ "$exe" == "blastn" ]]; then
      printf '%s\n' "246000M"
    else
      printf '%s\n' "120000M"
    fi
  fi
}

if ! _memory_request=$(_resolve_memory_request "$cluster" "${_arg_mem:-0}" "${_arg_executable:-}"); then
  error "Memory selection ought to be one of [$allowed_mem_settings]"
  exit 1
fi
# how many entries (header lines starting with '>') are there in the FASTA file?
# The count is what the split size is based on, aiming at ~ 10.000 split files.
# grep -c exits non-zero for "no match" (prints 0) and for an unreadable file
# (prints nothing); fall back to 0 in both cases.
nentries=$(grep -c '^>' -- "$FASTA") || nentries=${nentries:-0}
### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit ### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
if [ -z "$SLURM_JOB_ID" ]; then if [ -z "$SLURM_JOB_ID" ]; then
export SCRIPT_PATH=$(dirname $0) export SCRIPT_PATH=$(dirname $0)
submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_arg_mem --ramdisk=${_arg_ramdisk} -c $threads" submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE" script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE"
if [ -n "$_arg_reservation" ]; then if [ -n "$_arg_reservation" ]; then
...@@ -598,18 +666,35 @@ RAMDISK=$JOBDIR/ramdisk ...@@ -598,18 +666,35 @@ RAMDISK=$JOBDIR/ramdisk
HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ') HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ')
QUEUE='' QUEUE=''
for HOST in $HOSTLIST; do for HOST in $HOSTLIST; do
# when copying dereference putative links! if [ -L ${DATABASEPATH} ]; then
eval "ssh $HOST cp -Lr $DATABASEPATH $RAMDISK/. &" warning "If the reference directory is a link, fast stage-in is not possible."
PID=$! for fname in ${DATABASEPATH}/*; do
queue $PID eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
# outcommented because of bug in slurm 16.05, see TODO-item PID=$!
queue $PID
done
else
for fname in ${DATABASEPATH}/*; do
if [ -L "$fname" ]; then
eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
else
eval "ssh $HOST dd bs=4096 if=$fname of=${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
fi
done
fi
# TODO: check for dereferencing links, before enabling # TODO: check for dereferencing links, before enabling
# TODO: check for performance, before re-enabling
#sbcast $FILE $RAMDISK/$(basename $FILE) #sbcast $FILE $RAMDISK/$(basename $FILE)
done done
#DATABASE=$RAMDISK/$DATABASE #DATABASE=$RAMDISK/$DATABASE
DATABASE=$RAMDISK/$(basename $DATABASEPATH) DATABASE=$RAMDISK #/$(basename $DATABASEPATH)
WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME
# this script may never output to a user's $HOME # this script may never output to a user's $HOME
...@@ -700,6 +785,24 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e ...@@ -700,6 +785,24 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
#rm ./output/group_*.xml & #rm ./output/group_*.xml &
#rm -rf ./scratch & #rm -rf ./scratch &
wait wait
elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
# shrink the allocation, such that only the minimum necessary is accounted for
#scontrol update job=$SLURM_JOB_ID NumNodes=1
pwd
# merge all tabular (-outfmt 6) split output files
STARTC=$(date +%s.%N)
outfile="${JOBTAG}.out"
# select the first of all files
some_file=$(find ./output -name 'group*' | head -n1)
# write anything to the output file
for split_file in ./output/group_*gz; do
zcat $split_file >> $outfile
done
pigz -p 16 $outfile &
ENDC=$(date +%s.%N)
elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
rm -rf $WORKDIR/$SPLITFILEDIR &
wait
fi fi
# marks the end of this run # marks the end of this run
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment