LA_Wrapper: update to v0.5 (reviewed patch)

Notes on this reconstruction:
- Line breaks, indentation and blank context lines were rebuilt from the
  hunk line counts. Apply with whitespace-insensitive matching, e.g.
  `git apply --ignore-whitespace` or `patch -l`.
- Pre-image lines (context and '-') are kept as found. The '<...>'
  placeholders in the old usage/help strings look stripped and may have to
  be restored before the patch applies.
- The post-image blob id in the 'index' line predates the fixes below.

Fixes relative to the original '+' lines:
- --mem: validate against $allowed_mem_settings (was an unset variable and a
  literal "_arg_mem"), abort on invalid input and actually use the value.
- --splitup: the inferred value is rounded up and never 0; the FASTA file is
  counted once with 'grep -c' instead of twice with 'grep | wc -l'.
- -outfmt detection: search the parameters for the format (operands were
  reversed) and always define XMLOUT and OUTOUT.
- stage-in: no 'eval', quoted paths, one loop, warning printed once.
- merge: overwrite instead of append, quoted/guarded 'rm -rf', no unused
  variable.
- help: --splitup default, '<FASTA>'/'<DATABASE>' labels, typo 'defaultt'.

diff --git a/blast/parallel_BLAST/LA_Wrapper b/blast/parallel_BLAST/LA_Wrapper
index c83844c44d7492c76a573f137c5f6a9479262de3..6f5d92443cd0d256eca7c212b576080ded874c57 100755
--- a/blast/parallel_BLAST/LA_Wrapper
+++ b/blast/parallel_BLAST/LA_Wrapper
@@ -43,15 +43,11 @@ module purge
 # load the most current version of GNU parallel
 module load tools/parallel
 
-# load the most current version of BLAST +
-#module load bio/BLAST+ # do not rely on most recent version
-module load bio/BLAST+/2.9.0-foss-2018a
-
 module load lang/Python/3.6.4-foss-2018a
 
 ### setup variable for THIS script; giving absolute path if necessary
 SCRIPT="$0"
-SCRIPT_VERSION="0.4"
+SCRIPT_VERSION="0.5"
 
 # TODO: delete the following 3 functions, once sbcast is working
 function queue {
@@ -102,6 +98,10 @@ error() {
     (>&2 echo "ERROR: $1")
 }
 
+warning() {
+    (>&2 echo "WARNING: $1")
+}
+
 # THE DEFAULTS INITIALIZATION - POSITIONALS
 _positionals=()
 _arg_leftovers=()
@@ -112,10 +112,11 @@ _arg_queue=nodeshort
 _arg_assoc=$(sacct -nu $USER -o Account | tail -n1)
 declare -i _arg_nodes=1
 _arg_reservation=''
-declare _arg_mem=1G
+declare _memory_request=115500M
+declare _arg_mem=0
 declare -i _arg_blast_threads=1
 _arg_blast_params=''
-declare -i _arg_splitup_per_queryfile=20
+declare -i _arg_splitup_per_queryfile=0
 declare _arg_ramdisk=40G
 _arg_blastdir='.'
 _arg_executable='blastx'
@@ -126,26 +127,26 @@ print_help ()
 {
     echo "This script's help msg"
     printf 'Usage: %s [-l|--runlimit ] [-p|--partition ] [-s|--splitup ] [-N|--nodes ] [--executable ] [-m|--mem ] [--blastparams ] [-r|--ramdisk ] [--blastdir ] [--(no-)test] [-h|--help] \n' "$(basename $0)\n"
-    printf 'HINT: The FASTA and DATABASE items need to be full paths to files.'
-    printf "\t%s\n" ": path to the query FASTA file"
-    printf "\t%s\n" ": path to the database file"
-    printf "\t%s\n" "-l,--runlimit: runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)"
-    printf "\t%s\n" "-p,--partition: queue (default is nodeshort)"
-    printf "\t%s\n" "-A,--account: queue (default is the last submit account; an error is triggered if none specified nor can be deduced)"
-    printf "\t%s\n" "-N,--nodes: number of nodes (1 is the default)"
-    printf "\t%s\n" "--reservation: reservation to use (none is the default)"
-    printf "\t%s\n" "--time: time in minutes (300 is the default)"
-    printf "\t%s\n" "-m,--mem: memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn)"
-    printf "\t%s\n" "-r,--ramdisk: ramdisk size in units of GiB (default is 40 GiB)"
-    printf "\t%s\n" "-t,--threads: blast threads (default is 1)"
-    printf "\t%s\n" "--blastparams: blast parameters (default is -outfmt 5 (for xml output))"
-    printf "\t%s\n" "-s,--splitup: No. of FASTA sequences per query file (default is 20)"
-    printf "\t%s\n" "--blastdir: output directory (default is composition of input names)"
-    printf "\t%s\n" "--executable: choose executable (currently only from NCBI-BLAST, default: blastx)"
-    printf "\t%s\n" "--compress: if set, the output files will be merged and compressed (time consuming!, defaultt: off)"
-    printf "\t%s\n" "--test,--no-test: dry run, testing only (off by default)"
-    printf "\t%s\n" "--credits,--version: Prints credits and a brief version history and exits"
-    printf "\t%s\n" "-h,--help: Prints help"
+    printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n'
+    printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "<FASTA>" "path to the query FASTA file"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "<DATABASE>" "path to the database file"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-l,--runlimit" "runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-p,--partition" "queue (default is nodeshort)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-A,--account" "SLURM account (default is the last submit account; an error is triggered if none specified nor can be deduced)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes (1 is the default)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use (none is the default)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes (300 is the default)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-m,--mem" "memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn, omit the unit for submitting)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-r,--ramdisk" "ramdisk size in units of GiB (default is 40 GiB)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for blank tabulated output))"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-s,--splitup" "No. of FASTA sequences per query file (default: chosen automatically to yield about 10000 query files)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, default: off)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--test,--no-test" "dry run, testing only (off by default)"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--credits,--version" "Prints credits and a brief version history and exits"
+    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-h,--help" "Prints help"
     echo
     echo -e "\e[3mWARNINGS:\e[0m"
     echo -e "\e[3m- BLAST parameters:\e[0m"
@@ -200,6 +201,12 @@ credits()
     echo "- v0.3.2 -- 16. Jan. 2019 -- hot fix for new ramdisk and slurmstepd support"
     echo "- v0.4 -- 06. Mar. 2019 -- refactored version:"
     echo " - executables now pluggable"
+    echo "- v0.5 -- 17. Aug. 2019 -- fix: parser did not work for '--mem'-arg properly"
+    echo " update: - clearer UI"
+    echo " - better default memory settings"
+    echo " - faster stage-in for reference data"
+    echo " - automerge for -outfmt=6"
+    echo " - -outfmt=6 is now the default"
     echo
     echo "Current version is: $SCRIPT_VERSION"
     echo
@@ -423,11 +430,16 @@ FASTA=$_arg_fasta
 DATABASE=$_arg_database
 
 ### check if query & database exist
-if [ ! -e "$FASTA" ]; then
+if [[ $_arg_test == "off" && ! -e "$FASTA" ]]; then
     error "FASTA input was: '$FASTA' - no such file!"
     exit 1
 fi
 
+if [[ $_arg_test == "off" && ! -d "$DATABASE" ]]; then
+    error "DATABASE input was: '$DATABASE' - no such directory!"
+    exit 1
+fi
+
 #TODO: differentiate between blastn,x,p -- for now, all are equal
 if [ "blastx" = "${_arg_executable,,}" ]; then
     executable="blastx"
@@ -471,9 +483,33 @@ DB=${DATABASEID%.*}
 
 JOBTAG="BLAST_${FA}_VS_${DB}"
 
+# how many entries are there in the FASTA file?
+echo "Checking input file"
+# count FASTA header lines; an unreadable file (dry run) yields 0 entries
+nentries=$(grep -c '^>' -- "$FASTA")
+nentries=${nentries:-0}
+# we try to set the split number to a value, which ensures an output of
+# ~ 10.000 split files
+if [ "$_arg_splitup_per_queryfile" -ne 0 ]; then # the user thinks differently?
+    nsplits=$((nentries / _arg_splitup_per_queryfile))
+    if [ "$nsplits" -gt 50000 ]; then
+        error "There would be more than '$nsplits' files in scratch."
+        exit 1
+    elif [ "$nsplits" -gt 15000 ]; then
+        warning "There will be '$nsplits' files in scratch."
+    fi
+else # infer the value
+    # round up and never go below 1 sequence per query file: a plain
+    # 'nentries / 10000' is 0 for inputs with fewer than 10000 entries
+    _arg_splitup_per_queryfile=$(( (nentries + 9999) / 10000 ))
+    if [ "$_arg_splitup_per_queryfile" -lt 1 ]; then
+        _arg_splitup_per_queryfile=1
+    fi
+fi
+
 # default values, see:
 # https://www.ncbi.nlm.nih.gov/books/NBK279675/
-DEFAULT_BLASTPARAMS='-outfmt 5'
+DEFAULT_BLASTPARAMS='-outfmt 6'
 # sanity check: '-outfmt' in blast parameters?
 if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
     BLASTPARAMS=$_arg_blastparams
@@ -483,8 +519,13 @@ fi
 # test whether the output is xml or not
-if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then
+if [[ "$BLASTPARAMS" =~ -outfmt[[:space:]]+5($|[[:space:]]) ]]; then
     XMLOUT=1
+    OUTOUT=0
+elif [[ "$BLASTPARAMS" =~ -outfmt[[:space:]]+6($|[[:space:]]) ]]; then
+    XMLOUT=0
+    OUTOUT=1
 else
     XMLOUT=0
+    OUTOUT=0
 fi
 
 # TODO: port to M2
@@ -550,15 +591,38 @@ fi
 ### which is the reference directory size?
 _arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M
 
-
 if [[ ! $SCRIPT == /* ]]; then
     SCRIPT="$PWD/$SCRIPT";
 fi
 
+# which cluster are we on?
+cluster=$(sacctmgr show cluster -p | tail -n1 | cut -f1 -d '|')
+if [[ "$_arg_mem" != "0" ]]; then # user tries to select a non-default memory
+    # the value is given without unit; 'M' is appended for the submission
+    allowed_mem_settings="115500 242500 497500"
+    if [[ " $allowed_mem_settings " != *" $_arg_mem "* ]]; then
+        error "Memory selection ought to be one of [$allowed_mem_settings]"
+        exit 1
+    fi
+    _memory_request="${_arg_mem}M"
+elif [[ "$cluster" == "mogon" ]]; then # Mogon I: set the memory default accordingly
+    if [ "$_arg_executable" == "blastn" ]; then
+        _memory_request="242500M"
+    else
+        _memory_request="115500M"
+    fi
+else # defaults for any other cluster
+    if [ "$_arg_executable" == "blastn" ]; then
+        _memory_request="246000M"
+    else
+        _memory_request="120000M"
+    fi
+fi
+
 ### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
 if [ -z "$SLURM_JOB_ID" ]; then
     export SCRIPT_PATH=$(dirname $0)
-    submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_arg_mem --ramdisk=${_arg_ramdisk} -c $threads"
+    submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
     script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE"
 
     if [ -n "$_arg_reservation" ]; then
@@ -598,18 +662,32 @@ RAMDISK=$JOBDIR/ramdisk
 
 HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ')
 QUEUE=''
+
+# If the reference directory itself is a link, every file is copied with
+# 'cp -L'; warn once instead of once per host.
+if [ -L "${DATABASEPATH}" ]; then
+    warning "If the reference directory is a link, fast stage-in is not possible."
+fi
 for HOST in $HOSTLIST; do
-    # when copying dereference putative links!
-    eval "ssh $HOST cp -Lr $DATABASEPATH $RAMDISK/. &"
-    PID=$!
-    queue $PID
-    # outcommented because of bug in slurm 16.05, see TODO-item
+    for fname in "${DATABASEPATH}"/*; do
+        [ -e "$fname" ] || continue # empty directory: the glob did not match
+        target="${RAMDISK}/$(basename "$fname")"
+        if [ -L "${DATABASEPATH}" ] || [ -L "$fname" ]; then
+            # when copying dereference putative links!
+            ssh "$HOST" cp -L "$fname" "$target" &
+        else
+            ssh "$HOST" dd bs=4096 if="$fname" of="$target" &
+        fi
+        PID=$!
+        queue $PID
+    done
     # TODO: check for dereferencing links, before enabling
+    # TODO: check for performance, before re-enabling
     #sbcast $FILE $RAMDISK/$(basename $FILE)
 done
 
 #DATABASE=$RAMDISK/$DATABASE
-DATABASE=$RAMDISK/$(basename $DATABASEPATH)
+DATABASE=$RAMDISK #/$(basename $DATABASEPATH)
 WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME
 
 # this script may never output to a user's $HOME
@@ -700,6 +778,25 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
     #rm ./output/group_*.xml &
     #rm -rf ./scratch &
     wait
+elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ "${OUTOUT:-0}" -eq 1 ]; then
+    # shrink the allocation, such that only the minimum necessary is accounted for
+    #scontrol update job=$SLURM_JOB_ID NumNodes=1
+    pwd
+    # merge all tabular (-outfmt 6) output files
+    STARTC=$(date +%s.%N)
+    outfile="${JOBTAG}.out"
+    # concatenate the decompressed split results; the single redirection
+    # truncates a left-over output file instead of appending to it
+    for split_file in ./output/group_*gz; do
+        [ -e "$split_file" ] || continue # the glob did not match anything
+        zcat "$split_file"
+    done > "$outfile"
+    pigz -p 16 "$outfile" &
+    ENDC=$(date +%s.%N)
+    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
+    # never expand to 'rm -rf /' if one of the variables is unset
+    rm -rf "${WORKDIR:?}/${SPLITFILEDIR:?}" &
+    wait
 fi
 
 # marks the end of this run