#!/bin/bash # Copyright (c) -2014, Christoph Martin, JGU Mainz # 2014-2019, Christian Meesters, JGU Mainz # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Christian Meesters or the JGU Mainz nor the names of # its contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL CHRISTIAN MEESTERS OR THE JGU MAINZ BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH # DAMAGE. #set -x set -e PS4='Line ${LINENO}: ' # to measure the excecution time independent of SLURM START=$(date +%s.%N) # purging the module list, because interference with other modules # should be avoided module purge # load the most current version of GNU parallel module load tools/parallel module load lang/Python/3.6.4-foss-2018a ### setup variable for THIS script; giving absolute path if necessary SCRIPT="$0" SCRIPT_VERSION="0.5" # TODO: delete the following 3 functions, once sbcast is working function queue { QUEUE="$QUEUE $1" NUM=$((NUM+1)) } function regeneratequeue { OLDREQUEUE=$QUEUE QUEUE="" NUM=0 for PID in $OLDREQUEUE; do if [ -d /proc/$PID ]; then QUEUE="$QUEUE $PID" NUM=$((NUM+1)) fi done } function checkqueue { OLDCHQUEUE=$QUEUE for PID in $OLDCHQUEUE; do if [ ! -d /proc/$PID ]; then regeneratequeue # at least one PID has finished fi done } die() { local _ret=$2 test -n "$_ret" || _ret=1 test "$_PRINT_HELP" = yes && print_help >&2 echo "$1" >&2 exit ${_ret} } begins_with_short_option() { local first_option all_short_options all_short_options='lqscmrh' first_option="${1:0:1}" test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0 } # function to redirect simple error messages to stderr error() { (>&2 echo "ERROR: $1") } warning() { (>&2 echo "WARNING: $1") } # THE DEFAULTS INITIALIZATION - POSITIONALS _positionals=() _arg_leftovers=() # THE DEFAULTS INITIALIZATION - OPTIONALS declare -i _arg_runlimit=300 _arg_queue=nodeshort # TODO: ask for account, if string is NULL _arg_assoc=$(sacct -nu $USER -o Account | tail -n1) declare -i _arg_nodes=1 _arg_reservation='' declare _memory_request=115500M declare _arg_mem=0 declare -i _arg_blast_threads=1 _arg_blast_params='' declare -i _arg_splitup_per_queryfile=0 declare _arg_ramdisk=40G _arg_blastdir='.' _arg_executable='blastx' _arg_test=off _arg_compress=on print_help () { echo "This script's help msg" printf 'Usage: %s [-l|--runlimit ] [-p|--partition ] [-s|--splitup ] [-N|--nodes ] [--executable ] [-m|--mem ] [--blastparams ] [-r|--ramdisk ] [--blastdir ] [--(no-)test] [-h|--help] \n' "$(basename $0)\n" printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n' printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "" "path to the query FASTA file" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "" "path to the database file" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-l,--runlimit" "runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-p,--partition" "queue (default is nodeshort)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-A,--account" "SLURM account (default is the last submit account; an error is triggered if none specified nor can be deduced)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes (1 is the default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use (none is the default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes (300 is the default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-m,--mem" "memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn, omit the unit for submitting)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-r,--ramdisk" "ramdisk size in units of GiB (default is 40 GiB)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for blank tabulated output))" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-s,--splitup" "No. of FASTA sequences per query file (default is to generate ~5000 files)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, defaultt: off)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--test,--no-test" "dry run, testing only (off by default)" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--credits,--version" "Prints credits and a brief version history and exits" printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-h,--help" "Prints help" echo echo -e "\e[3mWARNINGS:\e[0m" echo -e "\e[3m- BLAST parameters:\e[0m" echo " There are no BLAST default parameters, other than specifing XML output, the database and" echo " queries as well as the numer of threads (all set by this script)." echo " Detailed overview on parameters can be shown by entering -help" echo -e "\e[3m- Limitations:\e[0m" echo " - Currently only NCBI-BLAST+ executables are supported." echo " - References may be as big as 30G, 50G may also work." echo " References beyond this this may or not work." echo " Be sure to reserve sufficient RAM." echo echo "Planned features for upcoming versions:" echo "- better, more stable user interface" echo "- support for alternate implementations, e.g. BLAT, diamond" echo "- automatic merging of xml output" echo "- restart capability to resume work in case of timelimits." echo "- beeond support for handling greater references" echo "- automatic reference database generation in case of version mismatches or similar" echo echo "Minor improvements will be implemented as time permits. Yet, to make this possible," echo -e "\e[1myou can request a collaboration\e[0m (for minor add ons an acknowladegement by name will do)." } credits() { echo "The original implementation (2013/2014) was written by Christoph Martin (ZDV, UNIX group)" echo "Benjamin Rieger (Institut für Molekulargenetik) contributed a perl implementation" echo "of a format conform splitting of FASTA files, which is not used, anymore." echo "The original implemenation was a LSF chain job. It was eventually adopted and maintained by" echo "Christian Meesters (ZDV, HPC group) from 2017 onwards." echo echo "I am particularly grateful for their feedback to:" echo "- Lukas Hellman (AG Hankeln)" echo "- Benjamin Rieger (NGS Facility)" echo echo "History of the re-implementation:" echo "- v0.1 -- 27. Sep. 2017 -- release of the re-implementation for SLURM supporting the" echo " ability to compute accross nodes." echo "- v0.1.1 -- 19. Oct. 2017 -- bug fix: blast parameters now correctly transferred to" echo " blast(x)." echo "- v0.1.2 -- 25. Oct. 2017 -- minor fixes and clarifications" echo "- v0.2.b -- 18. Jan. 2018 -- new features:" echo " - possible to choose between NCBI-BLAST executables" echo " - better user feedback / self documentation \(at least a start\)" echo " - error messages are now directed to stderr" echo " - jobs may use reservations (e.g. for courses)" echo " changes:" echo " - deleted default evalue and max_target_seqs settings" echo " - XML output is mandatory" echo " fix:" echo " - introduced missing --time option" echo " - changed the blast parameter interface" echo "- v0.3 -- 21. Feb. 2018 -- faster merging of output files, parallel zipping" echo "- v0.3.1 -- 15. May 2018 -- bugfixes in handling the blast executables" echo "- v0.3.2 -- 16. Jan. 2019 -- hot fix for new ramdisk and slurmstepd support" echo "- v0.4 -- 06. Mar. 2019 -- refactored version:" echo " - executables now pluggable" echo "- v0.5 -- 21. Aug. 2019 -- fix: parser did not work for '--mem'-arg properly" echo " update: - clearer UI" echo " - better default memory settings" echo " - faster stage-in for reference data" echo " - automerge for -outfmt=6" echo " - -outfmt=6 is now the default" echo echo "Current version is: $SCRIPT_VERSION" echo echo "The re-implementation supporting parallel BLAST+ jobs accross several compute nodes has been" echo "written and is maintained by Christian Meesters (ZDV, HPC group)." echo "A (personal) acknowledgement is welcomed. Please refer to:" echo "https://hpc.uni-mainz.de/high-performance-computing/publikationen/" exit } # command line parsing while test $# -gt 0 do _key="$1" case "$_key" in -l*|--runlimit|--runlimit=*) _val_from_long="${_key##--runlimit=}" _val_from_short="${_key##-l}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short" then _val="$_val_from_short" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_runlimit="$_val" ;; -p*|--partition|--partition=*) _val_from_long="${_key##--partition=}" _val_from_short="${_key##-p}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short" then _val="$_val_from_short" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_queue="$_val" ;; -A*|--account|--account=*) _val_from_long="${_key##--account=}" _val_from_short="${_key##-A}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short" then _val="$_val_from_short" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_assoc="$_val" ;; -N*|--nodes|--nodes=*) _val_from_long="${_key##--nodes=}" _val_from_short="${_key##-N}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short" then _val="$_val_from_short" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_nodes="$_val" ;; --reservation|--reservation=*) _val="${_key##--reservation=}" if test "$_val" = "$_key" then test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_reservation="$_val" ;; --time|--time=*) _val_from_long="${_key##--time=}" _val_from_short="${_key##-N}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short" then _val="$_val_from_short" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_runlimit="$_val" ;; -m*|--mem|--mem=*) _val_from_long="${_key##--mem=}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_mem="$_val" ;; --blastparams|--blastparams=*) _val_from_long="${_key##--blastparams=}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_blastparams="$_val" ;; -t*|--threads|--threads=*) _val_from_long="${_key##--threads=}" _val_from_short="${_key##-t}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short" then _val="$_val_from_short" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_blast_threads="$_val" ;; -s*|--splitup|--splitup=*) _val_from_long="${_key##--splitup=}" _val_from_short="${_key##-s}" if test "$_val_from_long" != "$_key" then _val="$_val_from_long" elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short" then _val="$_val_from_short" else test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_splitup_per_queryfile="$_val" ;; --blastdir|--blastdir=*) _val="${_key##--blastdir=}" if test "$_val" = "$_key" then test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_blastdir="$_val" ;; --executable|--executable=*) _val="${_key##--executable=}" if test "$_val" = "$_key" then test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1 _val="$2" shift fi _arg_executable="$_val" ;; --no-compress|--compress) _arg_compress="on" test "${1:0:5}" = "--no-" && _arg_compress="off" ;; --no-test|--test) _arg_test="on" test "${1:0:5}" = "--no-" && _arg_test="off" ;; --credits|--version) credits exit 0 ;; -h*|--help) print_help exit 0 ;; *) _positionals+=("$1") ;; esac shift done _positional_names=('_arg_fasta' '_arg_database' ) _required_args_string="'FASTA' and 'DATABASE'" #test ${#_positionals[@]} -lt 2 && _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 2 (namely: $_required_args_string), but got only ${#_positionals[@]}." 1 #test ${#_positionals[@]} -gt 2 && _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 2 (namely: $_required_args_string), but got ${#_positionals[@]} (the last one was: '${_positionals[*]: -1}')." 1 #for (( ii = 0; ii < ${#_positionals[@]}; ii++)) for (( ii = 0; ii < 2; ii++)) do eval "${_positional_names[ii]}=\${_positionals[ii]}" || die "Error during argument parsing, possibly an Argbash bug." 1 done _our_args=$((${#_positionals[@]} - ${#_positional_names[@]})) for (( ii = 0; ii < _our_args; ii++)); do _positional_names+=("_arg_leftovers[(($ii + 1))]") done ### get query and database files FASTA=$_arg_fasta DATABASE=$_arg_database ### check if query & database exist if [[ $_arg_test == "off" ]] && [ ! -e "$FASTA" ]; then error "FASTA input was: '$FASTA' - no such file!" exit 1 fi if [[ $_arg_test == "off" ]] && [ ! -d "$DATABASE" ]; then error "DATABASE input was: '$DATABASE' - no such directory!" exit 1 fi #TODO: differentiate between blastn,x,p -- for now, all are equal if [ "blastx" = "${_arg_executable,,}" ]; then executable="blastx" threads=2 if [ -z "$SLURM_JOB_ID" ]; then source $(dirname "$0")/blast_wrap.sh else source "${SCRIPT_PATH}"/blast_wrap.sh fi elif [ "blastn" = "${_arg_executable,,}" ]; then executable="blastn" threads=8 if [ -z "$SLURM_JOB_ID" ]; then source $(dirname "$0")/blast_wrap.sh else source "${SCRIPT_PATH}"/blast_wrap.sh fi elif [ "blastp" = "${_arg_executable,,}" ]; then executable="blastp" threads=2 if [ -z "$SLURM_JOB_ID" ]; then source $(dirname "$0")/blast_wrap.sh else source "${SCRIPT_PATH}"/blast_wrap.sh fi else error "executable '$_arg_executable' not recognized." print_help exit 2 fi ### prepare filepath and -names for creating working folder FASTAPATH=$(dirname $FASTA) DATABASEPATH=$(realpath $DATABASE) FASTAID=$(basename $FASTA) DATABASEID=$(basename $DATABASE) FA=${FASTAID%%.*} DB=${DATABASEID%.*} JOBTAG="BLAST_${FA}_VS_${DB}" # how many entries are there in the FASTA file? echo "Checking input file" nentries=$(grep '>' $FASTA | wc -l) # we try to set the split number to a value, which ensures an output of # ~ 10.000 split files if [ $_arg_splitup_per_queryfile -ne 0 ]; then # the user thinks differently? nsplits=$((nentries / _arg_splitup_per_queryfile)) if [ $nsplits -gt 50000 ]; then error "There would be more than '$nsplits' files in scratch." exit 1 elif [ $nsplits -gt 15000 ]; then warning "There will be '$nsplits' files in scratch -- resulting in poor performance." fi else # infer the value _arg_splitup_per_queryfile=$((nentries / 5000)) fi # default values, see: # https://www.ncbi.nlm.nih.gov/books/NBK279675/ DEFAULT_BLASTPARAMS='-outfmt 6' # sanity check: '-outfmt' in blast parameters? if [[ "$_arg_blastparams" =~ "outfmt" ]]; then BLASTPARAMS=$_arg_blastparams else BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS" fi # test whether the output is xml or not if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then XMLOUT=1 OUTOUT=0 elif [[ '-outfmt 6' =~ "$BLASTPARAMS" ]]; then XMLOUT=0 OUTOUT=1 fi # TODO: port to M2 ### Auto-Adjust the Queue if [ $_arg_runlimit -le 300 ]; then if [ "$_arg_queue" != "nodeshort" ]; then _arg_queue="${_arg_queue/long/short}" fi else if [ "$_arg_queue" != "nodeslong" ]; then _arg_queue="${_arg_queue/short/long}" fi fi ### check if working directory already exists if [ -z "$SLURM_JOB_ID" ] && [[ $_arg_test == "off" ]]; then if [ -d "$JOBTAG" ]; then echo "$JOBTAG : directory already exists! Remove? (y/[n])" echo -n '>' read ENTER if [[ ${ENTER,,} = 'y' || ${ENTER,,} == 'yes' ]]; then echo "removing directory $JOBTAG" rm -r $JOBTAG else echo "So you want to continue regardless? (e.g. scratch files already existing) ([y]/n)" echo -n '>' read ENTER if [[ ${ENTER,,} = 'n' || ${ENTER,,} == 'no' ]] ; then exit fi fi fi fi # we just keep the current working directory PWD=$(pwd) ### give query and database absolute path if necessary if [[ ! $FASTAPATH == /* ]]; then FASTAPATH="$PWD/$FASTAPATH"; fi #if [[ ! $DATABASEPATH == /* ]]; then # DATABASEPATH="$PWD/$DATABASEPATH"; #fi FASTA="$FASTAPATH/$FASTAID" #DATABASE="$DATABASEPATH/$DATABASEID" ### setup blast and splitup executable; check if exist allowed_executables="blastx blastp blastn" if [[ ! $allowed_executables =~ (^|[[:space:]])"$_arg_executable"($|[[:space:]]) ]]; then # BLASTEXE=$(which $_arg_executable) #else error "$_arg_executable ought to be one of [$allowed_executables]" fi ### which is the reference directory size? _arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M if [[ ! $SCRIPT == /* ]]; then SCRIPT="$PWD/$SCRIPT"; fi # which cluster are we on? cluster=$(sacctmgr show cluster -p| tail -n1| cut -f1 -d '|') # if the cluster is Mogon I, set the memory default accordingly: if [ "$cluster" == "mogon" ]; then if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory allowed_mem_setting="115500 242500 497500" if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then error "Memory selection out to be one of [$allowed_mem_settings]" fi else # set a default memory if [ "$_arg_executable" == "blastn" ]; then _memory_request="242500M" else _memory_request="115500M" fi fi else if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory allowed_mem_setting="115500 242500 497500" if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then error "Memory selection out to be one of [$allowed_mem_settings]" fi else # set a default memory if [ "$_arg_executable" == "blastn" ]; then _memory_request="246000M" else _memory_request="120000M" fi fi fi # how many entries are there in the FASTA file? nentries=$(grep '>' $FASTA | wc -l) # we try to set the split number to a value, which ensures an output of # ~ 10.000 split files ### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit if [ -z "$SLURM_JOB_ID" ]; then export SCRIPT_PATH=$(dirname $0) submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads" script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE" if [ -n "$_arg_reservation" ]; then submit_statement="${submit_statement} --reservation=${_arg_reservation}" fi # paste them together submit_call="${submit_statement} ${script_statement}" if [[ $_arg_test == "on" ]]; then echo "Just testing - this command would be submitted:" echo $submit_call exit fi echo "sending job:" echo $submit_call eval $submit_call exit fi #### Self-Documentation echo "You are using $0, version $SCRIPT_VERSION" echo echo "Self-Documentation (on $(date))" [[ $_arg_test == "on" ]] && echo " - This is a test-run, only" echo " - The query input is '$FASTA'" echo " - The database is '$DATABASE'" BLASTVERSION=$($BLASTEXE -version | head -n1 | cut -f2 -d' ') echo " - The executable is '$BLASTEXE', version: $BLASTVERSION" echo " - Parameters to your call are: $BLASTPARAMS" ### set variables for lokal (node side) directories (working & ramdisk) JOBDIR=/localscratch/$SLURM_JOB_ID RAMDISK=$JOBDIR/ramdisk # copy DB onto ramdisk # TODO: change to sbcast, once available HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ') QUEUE='' for HOST in $HOSTLIST; do if [ -L ${DATABASEPATH} ]; then warning "If the reference directory is a link, fast stage-in is not possible." for fname in ${DATABASEPATH}/*; do eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" & PID=$! queue $PID done else for fname in ${DATABASEPATH}/*; do if [ -L "$fname" ]; then eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" & PID=$! queue $PID else eval "ssh $HOST dd bs=4096 if=$fname of=${RAMDISK}/$(basename $fname)" & PID=$! queue $PID fi done fi # TODO: check for dereferencing links, before enabling # TODO: check for performance, before re-enabling #sbcast $FILE $RAMDISK/$(basename $FILE) done #DATABASE=$RAMDISK/$DATABASE DATABASE=$RAMDISK #/$(basename $DATABASEPATH) WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME # this script may never output to a user's $HOME if [[ *"$WORKDIR"* = 'home' ]]; then eror "Cowardly refusing to operate in a home directory." fi # set path names to ease maintance SPLITFILEDIR=scratch ### check if exists, if no -> firsttime run, create subdirs and enter, split query file, else just enter if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then mkdir -p "$WORKDIR/$SPLITFILEDIR" || exit 1; mkdir -p "$WORKDIR/output" || exit 1; cd "$WORKDIR" echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)" eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile PID=$! queue $PID fi # wait until the copy and a possible scratch generation are finished while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do checkqueue sleep 5 done cd "$WORKDIR" # calculating the degree of parallelism is necessary in order not to oversaturate with srun processes. # As $SLURM_CPUS_PER_TASK can be unset, we need to set it in those cases if [[ -z "$SLURM_CPUS_PER_TASK" ]]; then declare -i SLURM_CPUS_PER_TASK=1 fi if [[ -z $DATABASE ]]; then error "Unable to recognize database, please get in touch with hpc@uni-mainz.de" fi # see whether we find a file in the db tmp=$(find $DATABASE -type f -print -quit) # remove the 2nd suffix DATABASE=${tmp%.*} ### a temporary script to conduct the alignment cmdfile=/localscratch/$SLURM_JOB_ID/dummy.sh cmdfilewriter chmod +x $cmdfile newcmd=/localscratch/$SLURM_JOBID/dummy_wrapper.sh sbcast $cmdfile $newcmd rm $cmdfile cmdfile=$newcmd ### append a finishing token to the samples samples+=('done') parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS " srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))" $parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta') wait n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort ) <(cd scratch && find . | grep -o '[0-9]*' |sort )|wc -l) if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then # shrink the alloction, such that only the minimum necessary is accounted for #scontrol update job=$SLURM_JOB_ID NumNodes=1 pwd # merge all xml files STARTC=$(date +%s.%N) outfile="${JOBTAG}.xml" # select the first of all files some_file=$(find ./output -name 'group*' | head -n1) zcat $some_file | head -n21 > $outfile # extract header information and write to outfile for split_file in ./output/group_*.xml.gz; do zcat $split_file | tail -n+22 |head -n-3 >> $outfile done # extract footer information and write to outfile zcat $some_file | tail -n3 >> $outfile pigz -p 16 $outfile & ENDC=$(date +%s.%N) elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60") rm -rf $WORKDIR/$SPLITFILEDIR & #rm ./output/group_*.xml & #rm -rf ./scratch & wait elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then # shrink the alloction, such that only the minimum necessary is accounted for #scontrol update job=$SLURM_JOB_ID NumNodes=1 pwd # merge all xml files STARTC=$(date +%s.%N) outfile="${JOBTAG}.out" # select the first of all files some_file=$(find ./output -name 'group*' | head -n1) # write anything to the output file for split_file in ./output/group_*gz; do zcat $split_file >> $outfile rm $split_file done pigz -p 16 $outfile & ENDC=$(date +%s.%N) elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60") rm -rf $WORKDIR/$SPLITFILEDIR & wait fi # marks the end of this run END=$(date +%s.%N) elapsed=$(bc <<< "scale=1; (($END-$START))/60") echo "parallel_BLAST took $elapsed minutes to run" # TODO: Check: 1 output item per input scratch file? # TODO: If not re-submit with correct/adjusted job size