LA_Wrapper 33.8 KB
Newer Older
1 2
#!/bin/bash

3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
# Copyright (c)     -2014, Christoph Martin, JGU Mainz
#               2014-2019, Christian Meesters, JGU Mainz
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Christian Meesters or the JGU Mainz nor the names of
#       its contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
Christian Meesters's avatar
Christian Meesters committed
22
# ARE DISCLAIMED. IN NO EVENT SHALL CHRISTIAN MEESTERS OR THE JGU MAINZ BE LIABLE
23 24 25 26 27 28 29 30 31
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.


32
#set -x
33 34
set -e
PS4='Line ${LINENO}: '
35

Christian Meesters's avatar
Christian Meesters committed
36 37 38
# to measure the excecution time independent of SLURM
START=$(date +%s.%N)

39 40 41 42 43
# purging the module list, because interference with other modules
# should be avoided
module purge

# load the most current version of GNU parallel
Christian Meesters's avatar
Christian Meesters committed
44
module load tools/parallel/20190822
45

46
module load bio/Biopython/1.74-foss-2019a
Christian Meesters's avatar
Christian Meesters committed
47 48 49 50 51
#TODO: find a solution for the bug in BLAST+ AND to select the version by hand
module load bio/BLAST+/2.9.0-gompi-2019a
#module load bio/BLAST+/2.7.1-foss-2018a


52

53 54
### setup variable for THIS script; giving absolute path if necessary
SCRIPT="$0"
55
SCRIPT_VERSION="0.5.3"
56

57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
# TODO: delete the following 3 functions, once sbcast is working
function queue {
  QUEUE="$QUEUE $1"
  NUM=$((NUM+1))
}

function regeneratequeue {
  OLDREQUEUE=$QUEUE
  QUEUE=""
  NUM=0
  for PID in $OLDREQUEUE; do
    if [ -d /proc/$PID ]; then
      QUEUE="$QUEUE $PID"
      NUM=$((NUM+1))
    fi
  done
}

function checkqueue {
  OLDCHQUEUE=$QUEUE
  for PID in $OLDCHQUEUE; do
    if [ ! -d /proc/$PID ]; then
      regeneratequeue # at least one PID has finished
    fi
  done
}

die()
{
        local _ret=$2
        test -n "$_ret" || _ret=1
        test "$_PRINT_HELP" = yes && print_help >&2
        echo "$1" >&2
        exit ${_ret}
}

begins_with_short_option()
{
        local first_option all_short_options
        all_short_options='lqscmrh'
        first_option="${1:0:1}"
        test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
}

101 102 103 104 105 106 107 108
check_nucleotide_db() {
    echo "check not yet implemented"
}

check_protein_db() {
    echo "check not yet implemented"
}

109 110 111 112
# function to redirect simple error messages to stderr
error() {
    (>&2 echo "ERROR: $1") 
}
113

114 115 116 117
warning() {
    (>&2 echo "WARNING: $1") 
}

118 119 120 121 122 123
# THE DEFAULTS INITIALIZATION - POSITIONALS
_positionals=()
_arg_leftovers=()
# THE DEFAULTS INITIALIZATION - OPTIONALS
declare -i _arg_runlimit=300
_arg_queue=nodeshort
124
# TODO: ask for account, if string is NULL
125 126
_arg_assoc=$(sacct -nu $USER -o Account | tail -n1)
declare -i _arg_nodes=1
127
_arg_reservation=''
128 129
declare _memory_request=115500M
declare _arg_mem=0
130
declare -i _arg_blast_threads=1
131
_arg_blast_params=''
132
declare -i _arg_splitup_per_queryfile=0
133 134
declare _arg_ramdisk=40G
_arg_blastdir='.'
135
_arg_executable='blastx'
136
_arg_test=off
137
_arg_compress=on
138
_arg_debug=off
139 140 141 142

print_help ()
{
        echo "This script's help msg"
143
        printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-N|--nodes <arg>] [--executable <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
        printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n'
        printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "<FASTA>" "path to the query FASTA file"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "<DATABASE>" "path to the database file"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-l,--runlimit" "runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-p,--partition" "queue (default is nodeshort)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-A,--account" "SLURM account (default is the last submit account; an error is triggered if none specified nor can be deduced)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes  (1 is the default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use  (none is the default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes  (300 is the default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for blank tabulated output))"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, defaultt: off)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--test,--no-test" "dry run, testing only (off by default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--credits,--version" "Prints credits and a brief version history and exits"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-h,--help" "Prints help"
161
        echo
162 163 164 165 166 167 168 169 170 171
        echo -e "\e[3mWARNINGS:\e[0m"
        echo -e "\e[3m- BLAST parameters:\e[0m"
        echo "  There are no BLAST default parameters, other than specifing XML output, the database and"
        echo "  queries as well as the numer of threads (all set by this script)."
        echo "  Detailed overview on parameters can be shown by entering <blastexecutable> -help"
        echo -e "\e[3m- Limitations:\e[0m"
        echo "  - Currently only NCBI-BLAST+ executables are supported."
        echo "  - References may be as big as 30G, 50G may also work."
        echo "    References beyond this this may or not work."
        echo "    Be sure to reserve sufficient RAM."
Christian Meesters's avatar
Christian Meesters committed
172 173 174 175 176 177 178 179 180 181
        echo 
        echo "Planned features for upcoming versions:"
        echo "- better, more stable user interface"
        echo "- support for alternate implementations, e.g. BLAT, diamond"
        echo "- automatic merging of xml output"
        echo "- restart capability to resume work in case of timelimits."
        echo "- beeond support for handling greater references"
        echo "- automatic reference database generation in case of version mismatches or similar"
        echo 
        echo "Minor improvements will be implemented as time permits. Yet, to make this possible,"
182
        echo -e "\e[1myou can request a collaboration\e[0m (for minor add ons an acknowladegement by name will do)."
183 184
}

Christian Meesters's avatar
Christian Meesters committed
185 186 187 188
credits()
{
        echo "The original implementation (2013/2014) was written by Christoph Martin (ZDV, UNIX group)"
        echo "Benjamin Rieger (Institut für Molekulargenetik) contributed a perl implementation"
189
        echo "of a format conform splitting of FASTA files, which is not used, anymore."
Christian Meesters's avatar
Christian Meesters committed
190
        echo "The original implemenation was a LSF chain job. It was eventually adopted and maintained by"
191
        echo "Christian Meesters (ZDV, HPC group) from 2017 onwards."
Christian Meesters's avatar
Christian Meesters committed
192
        echo 
193 194 195 196
        echo "I am particularly grateful for their feedback to:"
        echo "- Lukas Hellman (AG Hankeln)"
        echo "- Benjamin Rieger (NGS Facility)"
        echo 
Christian Meesters's avatar
Christian Meesters committed
197
        echo "History of the re-implementation:"
198 199 200 201
        echo "- v0.1   -- 27. Sep. 2017 -- release of the re-implementation for SLURM supporting the"
        echo "                             ability to compute accross nodes."
        echo "- v0.1.1 -- 19. Oct. 2017 -- bug fix: blast parameters now correctly transferred to"
        echo "                             blast(x)."
Christian Meesters's avatar
Christian Meesters committed
202
        echo "- v0.1.2 -- 25. Oct. 2017 -- minor fixes and clarifications"
203
        echo "- v0.2.b -- 18. Jan. 2018 -- new features:"
204 205 206
        echo "                             - possible to choose between NCBI-BLAST executables"
        echo "                             - better user feedback / self documentation \(at least a start\)"
        echo "                             - error messages are now directed to stderr"
207
        echo "                             - jobs may use reservations (e.g. for courses)"
208 209 210
        echo "                             changes:"
        echo "                             - deleted default evalue and max_target_seqs settings"
        echo "                             - XML output is mandatory"
211 212
        echo "                             fix:"
        echo "                             - introduced missing --time option"
213
        echo "                             - changed the blast parameter interface"
Christian Meesters's avatar
Christian Meesters committed
214
        echo "- v0.3   -- 21. Feb. 2018 -- faster merging of output files, parallel zipping"
215 216 217 218
        echo "- v0.3.1 -- 15. May  2018 -- bugfixes in handling the blast executables"
        echo "- v0.3.2 -- 16. Jan. 2019 -- hot fix for new ramdisk and slurmstepd support"
        echo "- v0.4   -- 06. Mar. 2019 -- refactored version:"
        echo "                             - executables now pluggable"
219
        echo "- v0.5   -- 21. Aug. 2019 -- fix: parser did not work for '--mem'-arg properly"
220 221 222 223 224
        echo "                             update: - clearer UI"
        echo "                                     - better default memory settings"
        echo "                                     - faster stage-in for reference data"
        echo "                                     - automerge for -outfmt=6"
        echo "                                     - -outfmt=6 is now the default"
225 226 227 228
        echo "- v0.5.1 -- 29. Aug. 2019 -- numerous housekeeping fixes"
        echo "- v0.5.2 -- 02. Sep. 2019 -- fix:"
        echo "                             - consistent biopython inclusion"
        echo "                             - auto-detection of database size and memory selection"
229
        echo "- v0.5.3 -- 24. Sep. 2019 -- fix: dereferencing of reference / database files re-enabled"
230 231
        echo 
        echo "Current version is: $SCRIPT_VERSION"
Christian Meesters's avatar
Christian Meesters committed
232 233 234 235 236
        echo
        echo "The re-implementation supporting parallel BLAST+ jobs accross several compute nodes has been"
        echo "written and is maintained by Christian Meesters (ZDV, HPC group)."
        echo "A (personal) acknowledgement is welcomed. Please refer to:"
        echo "https://hpc.uni-mainz.de/high-performance-computing/publikationen/"
237
        exit
Christian Meesters's avatar
Christian Meesters committed
238 239
}

240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
# command line parsing
while test $# -gt 0
do
        _key="$1"
        case "$_key" in
                -l*|--runlimit|--runlimit=*)
                        _val_from_long="${_key##--runlimit=}"
                        _val_from_short="${_key##-l}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_runlimit="$_val"
                        ;;
                -p*|--partition|--partition=*)
                        _val_from_long="${_key##--partition=}"
                        _val_from_short="${_key##-p}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_queue="$_val"
                        ;;
                -A*|--account|--account=*)
                        _val_from_long="${_key##--account=}"
                        _val_from_short="${_key##-A}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_assoc="$_val"
                        ;;
                -N*|--nodes|--nodes=*)
                        _val_from_long="${_key##--nodes=}"
                        _val_from_short="${_key##-N}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_nodes="$_val"
                        ;;
309 310 311 312 313 314 315 316 317 318
                --reservation|--reservation=*)
                        _val="${_key##--reservation=}"
                        if test "$_val" = "$_key"
                        then
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_reservation="$_val"
                        ;;
319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
                --time|--time=*)
                        _val_from_long="${_key##--time=}"
                        _val_from_short="${_key##-N}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_runlimit="$_val"
                        ;;
Christian Meesters's avatar
Christian Meesters committed
335 336
                -m*|--mem|--mem=*)
                        _val_from_long="${_key##--mem=}"
337 338 339 340 341 342 343 344
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
Christian Meesters's avatar
Christian Meesters committed
345
                        _arg_mem="$_val"
346
                        ;;
347 348 349 350 351 352 353 354 355 356 357 358
                --blastparams|--blastparams=*)
                        _val_from_long="${_key##--blastparams=}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_blastparams="$_val"
                        ;;
359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
                -t*|--threads|--threads=*)
                        _val_from_long="${_key##--threads=}"
                        _val_from_short="${_key##-t}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_blast_threads="$_val"
                        ;;
                -s*|--splitup|--splitup=*)
                        _val_from_long="${_key##--splitup=}"
                        _val_from_short="${_key##-s}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_splitup_per_queryfile="$_val"
                        ;;
                --blastdir|--blastdir=*)
                        _val="${_key##--blastdir=}"
                        if test "$_val" = "$_key"
                        then
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_blastdir="$_val"
                        ;;
401 402 403 404 405 406 407 408 409 410
                --executable|--executable=*)
                        _val="${_key##--executable=}"
                        if test "$_val" = "$_key"
                        then
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_executable="$_val"
                        ;;
Christian Meesters's avatar
Christian Meesters committed
411 412 413 414
                --no-compress|--compress)
                        _arg_compress="on"
                        test "${1:0:5}" = "--no-" && _arg_compress="off"
                        ;;
415 416 417 418
                --no-test|--test)
                        _arg_test="on"
                        test "${1:0:5}" = "--no-" && _arg_test="off"
                        ;;
419 420 421 422
                --debug)
                        _arg_debug="on"
                        set -x
                        ;;
423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
                --credits|--version)
                        credits
                        exit 0
                        ;;
                -h*|--help)
                        print_help
                        exit 0
                        ;;
                *)
                        _positionals+=("$1")
                        ;;
        esac
        shift
done

_positional_names=('_arg_fasta' '_arg_database' )
_required_args_string="'FASTA' and 'DATABASE'"
#test ${#_positionals[@]} -lt 2 && _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 2 (namely: $_required_args_string), but got only ${#_positionals[@]}." 1
#test ${#_positionals[@]} -gt 2 && _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 2 (namely: $_required_args_string), but got ${#_positionals[@]} (the last one was: '${_positionals[*]: -1}')." 1
#for (( ii = 0; ii < ${#_positionals[@]}; ii++))
for (( ii = 0; ii < 2; ii++))
do
        eval "${_positional_names[ii]}=\${_positionals[ii]}" || die "Error during argument parsing, possibly an Argbash bug." 1
done
_our_args=$((${#_positionals[@]} - ${#_positional_names[@]}))
448 449
for (( ii = 0; ii < _our_args; ii++)); do
    _positional_names+=("_arg_leftovers[(($ii + 1))]")
450 451 452 453
done

### get query and database files
FASTA=$_arg_fasta
454
DATABASE=$(realpath $_arg_database)
455 456

### check if query & database exist
457
if [[ $_arg_test == "off" ]] && [ ! -e "$FASTA" ]; then
458
    error "FASTA input was: '$FASTA' - no such file!"
459 460 461
    exit 1
fi

462 463 464 465 466
if [[ $_arg_test == "off" ]] && [ ! -d "$DATABASE" ]; then
    error "DATABASE input was: '$DATABASE' - no such directory!"
    exit 1
fi

467 468 469
if [ "blastx" = "${_arg_executable,,}" ]; then
    executable="blastx"
    threads=2
470
    DBSUFFIX=".nal" # db suffix to be removed from nal file - not working, if nal file not present
471 472 473 474 475 476 477 478
    if [ -z "$SLURM_JOB_ID" ]; then
        source $(dirname "$0")/blast_wrap.sh
    else
        source "${SCRIPT_PATH}"/blast_wrap.sh
    fi
elif [ "blastn" = "${_arg_executable,,}" ]; then
    executable="blastn"
    threads=8
479
    #check_nucleotide_db
480 481 482 483 484 485 486 487
    if [ -z "$SLURM_JOB_ID" ]; then
        source $(dirname "$0")/blast_wrap.sh
    else
        source "${SCRIPT_PATH}"/blast_wrap.sh
    fi
elif [ "blastp" = "${_arg_executable,,}" ]; then
    executable="blastp"
    threads=2
488
    #check_protein_db
489 490 491 492 493 494 495 496 497 498
    if [ -z "$SLURM_JOB_ID" ]; then
        source $(dirname "$0")/blast_wrap.sh
    else
        source "${SCRIPT_PATH}"/blast_wrap.sh
    fi
else 
    error "executable '$_arg_executable' not recognized."
    print_help
    exit 2
fi   
499 500 501

### prepare filepath and -names for creating working folder
FASTAPATH=$(dirname $FASTA)
502
DATABASEPATH=$(realpath $DATABASE)
503 504 505 506 507 508 509 510 511

FASTAID=$(basename $FASTA)
DATABASEID=$(basename $DATABASE)

FA=${FASTAID%%.*}
DB=${DATABASEID%.*}

JOBTAG="BLAST_${FA}_VS_${DB}"

512 513 514 515 516 517 518 519 520 521 522
# how many entries are there in the FASTA file?
echo "Checking input file"
nentries=$(grep '>' $FASTA | wc -l)
# we try to set the split number to a value, which ensures an output of 
# ~ 10.000 split files
if [ $_arg_splitup_per_queryfile -ne 0 ]; then # the user thinks differently?
    nsplits=$((nentries / _arg_splitup_per_queryfile))
    if [ $nsplits -gt 50000 ]; then
        error "There would be more than '$nsplits' files in scratch."
        exit 1
    elif [ $nsplits -gt 15000 ]; then
523
        warning "There will be '$nsplits' files in scratch -- resulting in poor performance."
524 525
    fi
else # infer the value 
526
    _arg_splitup_per_queryfile=$((nentries / 5000))
527 528
fi

529 530
# default values, see:
#  https://www.ncbi.nlm.nih.gov/books/NBK279675/
531
DEFAULT_BLASTPARAMS='-outfmt 6'
532 533 534 535
# sanity check: '-outfmt' in blast parameters?
if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
    BLASTPARAMS=$_arg_blastparams
else
Christian Meesters's avatar
Christian Meesters committed
536 537
    BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS"
fi
538 539

### testing for output options - to be used later 
Christian Meesters's avatar
Christian Meesters committed
540
# test whether the output is xml or not
541
if [[ '-outfmt 5' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
Christian Meesters's avatar
Christian Meesters committed
542
    XMLOUT=1
543
    OUTOUT=0
544 545
# test whether the output is plain tabular or not
elif [[ '-outfmt 6' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
Christian Meesters's avatar
Christian Meesters committed
546
    XMLOUT=0
547
    OUTOUT=1
548
fi
549

550
# TODO: port to M2
551 552 553 554 555 556 557 558 559 560 561 562 563 564
### Auto-Adjust the Queue
if [ $_arg_runlimit -le 300 ]; then
    if [ "$_arg_queue" != "nodeshort" ]; then
        _arg_queue="${_arg_queue/long/short}"
    fi
else
    if [ "$_arg_queue" != "nodeslong" ]; then
        _arg_queue="${_arg_queue/short/long}"
    fi
fi

### check if working directory already exists
if [ -z "$SLURM_JOB_ID" ] && [[ $_arg_test == "off" ]]; then
    if [ -d "$JOBTAG" ]; then
565
        echo "$JOBTAG : directory already exists! Remove? (y/[n])"
566 567 568
        echo -n '>'
        read ENTER
        
569
        if [[ ${ENTER,,} = 'y' ||  ${ENTER,,} == 'yes' ]]; then
570 571 572
            echo "removing directory $JOBTAG"
            rm -r $JOBTAG
        else
Christian Meesters's avatar
Christian Meesters committed
573
            echo "So, you want to continue regardless (using the existing scratch files)? ([y]/n)"
574 575
            echo -n '>'
            read ENTER
Christian Meesters's avatar
Christian Meesters committed
576
            if [[ ${ENTER,,} = 'n' || ${ENTER,,} == 'no' ]] ; then
577 578
                exit
            fi
579 580 581 582 583 584 585 586 587 588 589 590 591 592 593
        fi
    fi
fi

# we just keep the current working directory
PWD=$(pwd)

### give query and database absolute path if necessary
if [[ ! $FASTAPATH == /* ]]; then
    FASTAPATH="$PWD/$FASTAPATH";
fi

FASTA="$FASTAPATH/$FASTAID"

### setup blast and splitup executable; check if exist
594
allowed_executables="blastx blastp blastn"
Christian Meesters's avatar
Christian Meesters committed
595 596 597
if [[ ! $allowed_executables =~ (^| [[:space:]])"$_arg_executable"($| )  ]]; then
    BLASTEXE=$(which $_arg_executable)
else
598
    error "$_arg_executable ought to be one of [$allowed_executables]"
Christian Meesters's avatar
Christian Meesters committed
599
    exit 1
600
fi
Christian Meesters's avatar
Christian Meesters committed
601
export _arg_executable
602

603
### which is the reference directory size?
604
_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )
605
if [[ ! $SCRIPT == /* ]]; then
606
   SCRIPT="$PWD/$SCRIPT";
607 608
fi

609 610 611
# which cluster are we on?
cluster=$(sacctmgr show cluster -p| tail -n1| cut -f1 -d '|')
# if the cluster is Mogon I, set the memory default accordingly:
612
allowed_mem_setting=""
613
if [ "$cluster" == "mogon" ]; then
614 615 616 617 618
    allowed_mem_settings="115500 242500 497500"
    # add a savety measure (100 MB, each core)
    for setting in $allowed_mem_settings;  do
        if [ $((_arg_ramdisk + 6400 )) -lt $setting ]; then
            allowed_mem_setting="$allowed_mem_setting $setting"
619
        fi
620 621 622 623 624 625 626 627 628 629 630 631
    done
    # test whether there is any valid setting left
    if [ -z "$allowed_mem_setting" ]; then
        error "database > available + necessary RAM"
        exit 1
    fi
    # remove first space, if any
    allowed_mem_setting="${allowed_mem_setting/ /}"
    if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
        if [[ ! "$allowed_mem_setting" =~ (^|[[:space:]])"$_arg_mem"($|[[:space:]]) ]]; then
           error "Memory selection out to be one of [$allowed_mem_setting]"
           exit 1
632
        fi
633 634 635 636 637 638 639 640
    # select a default
    else 
        _memory_request=$(echo $allowed_mem_setting | cut -d" " -f1)M
        #if [ "$_arg_executable" == "blastn" ]; then
        #    _memory_request="242500M"
        #else
        #    _memory_request="115500M"
        #fi
641
    fi
642
else # to be implemented for MII
643 644 645 646 647 648 649 650 651 652 653 654 655
    if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
        allowed_mem_setting="115500 242500 497500"
        if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then
            error "Memory selection out to be one of [$allowed_mem_settings]"
        fi
    else # set a default memory
        if [ "$_arg_executable" == "blastn" ]; then
            _memory_request="246000M"
        else
            _memory_request="120000M"
        fi
    fi
fi
656 657
# finally add another MB to the ramdisk to be save and the unit
_arg_ramdisk=$((_arg_ramdisk + 1024 ))M
658

659
### how many entries are there in the FASTA file?
660 661 662 663 664
nentries=$(grep '>' $FASTA | wc -l)
# we try to set the split number to a value, which ensures an output of 
# ~ 10.000 split files


665 666
### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
if [ -z "$SLURM_JOB_ID" ]; then
667
    export SCRIPT_PATH=$(dirname $0)
Christian Meesters's avatar
Christian Meesters committed
668
    submit_statement="sbatch -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
669

670 671 672 673 674 675 676
    script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation  --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable "
    if [[ $_arg_debug == "on" ]]; then
        script_statement="${script_statement} --debug"
    fi
    # supply the input files regarless of any user request:
    script_statement="${script_statement} $FASTA $DATABASE"

677 678 679 680 681 682
    if [ -n "$_arg_reservation" ]; then
        submit_statement="${submit_statement} --reservation=${_arg_reservation}"
    fi

    # paste them together
    submit_call="${submit_statement} ${script_statement}"
683 684 685 686 687
    if [[ $_arg_test == "on" ]]; then
        echo "Just testing - this command would be submitted:"
        echo $submit_call
        exit
    fi
688
    echo "sending job:"
689
    echo $submit_call
690
    eval $submit_call
691 692 693
    exit
fi

694 695 696 697 698 699 700 701 702 703 704
#### Self-Documentation
echo "You are using $0, version $SCRIPT_VERSION"
echo 
echo "Self-Documentation (on $(date))"
[[ $_arg_test == "on" ]] && echo "  - This is a test-run, only"
echo "  - The query input is '$FASTA'"
echo "  - The database is '$DATABASE'"
BLASTVERSION=$($BLASTEXE -version | head -n1 | cut -f2 -d' ')
echo "  - The executable is '$BLASTEXE', version: $BLASTVERSION"
echo "  - Parameters to your call are: $BLASTPARAMS"

705 706 707 708 709 710 711 712 713
### set variables for lokal (node side) directories (working & ramdisk)
JOBDIR=/localscratch/$SLURM_JOB_ID
RAMDISK=$JOBDIR/ramdisk

# copy DB onto ramdisk
# TODO: change to sbcast, once available
HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ')
QUEUE=''

Christian Meesters's avatar
Christian Meesters committed
714 715 716 717 718 719 720 721 722 723 724 725
#myhost=$(hostname -f)
stagefile=/localscratch/$SLURM_JOB_ID/dummy_stagein.sh
rstagefile=/localscratch/$SLURM_JOB_ID/stagein.sh
source "${SCRIPT_PATH}"/stage_in.sh
stage_in_writer
chmod +x $stagefile

# distribute the stagewriter
sbcast $stagefile $rstagefile

rm $stagefile
stagefile=$rstagefile
726

Christian Meesters's avatar
Christian Meesters committed
727 728
# we would not need this loop with regard to slurm, but as we have
# asynchronous tasks already, we keep track with the queue
729
for HOST in $HOSTLIST; do
Christian Meesters's avatar
Christian Meesters committed
730 731
   srun -w $HOST -N1 -n1 -c1 --mem-per-cpu=5000M $stagefile &
   queue $!
732 733 734
done

WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME
735 736
# this script may never output to a user's $HOME
if [[ *"$WORKDIR"* = 'home' ]]; then
Christian Meesters's avatar
Christian Meesters committed
737
    error "Cowardly refusing to operate in a home directory."
738
fi
739 740 741 742 743 744 745
# set path names to ease maintance
SPLITFILEDIR=scratch

### check if exists, if no -> firsttime run, create subdirs and enter, split query file, else just enter
if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
    mkdir -p "$WORKDIR/$SPLITFILEDIR" || exit 1;
    mkdir -p "$WORKDIR/output" || exit 1;
746
    cd "$WORKDIR"
747
    echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
748
    eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile " &     # splitup queryfile
Christian Meesters's avatar
Christian Meesters committed
749
    queue $!
750 751
fi

752 753 754 755 756 757
# wait until the copy and a possible scratch generation are finished
while  [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
   checkqueue
   sleep 5
done

758 759 760 761 762 763
DATABASE=$(find $RAMDISK -type f -print -quit)
# strip suffix - if more than one dot
while [ $(grep -o "\." <<< $DATABASE | wc -l) -gt 1 ]; do
   DATABASE=${DATABASE%.*}
done

Christian Meesters's avatar
Christian Meesters committed
764 765 766 767 768
if [[ -z $DATABASE ]]; then
    error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
    exit 1
fi

769 770 771 772 773 774 775 776
cd "$WORKDIR"

# calculating the degree of parallelism is necessary in order not to oversaturate with srun processes.
# As $SLURM_CPUS_PER_TASK can be unset, we need to set it in those cases
if [[ -z "$SLURM_CPUS_PER_TASK" ]]; then
   declare -i SLURM_CPUS_PER_TASK=1
fi

777 778 779 780 781 782 783 784 785 786 787 788
### a temporary script to conduct the alignment
cmdfile=/localscratch/$SLURM_JOB_ID/dummy.sh
cmdfilewriter

chmod +x $cmdfile

newcmd=/localscratch/$SLURM_JOBID/dummy_wrapper.sh
sbcast $cmdfile $newcmd

rm $cmdfile
cmdfile=$newcmd

789
samples=$(find $(pwd) -type f -name 'group*.fasta')
790 791
### append a finishing token to the samples
samples+=('done')
Christian Meesters's avatar
Christian Meesters committed
792 793 794

parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS "

795
srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))"
796
$parallel "$srun" "$cmdfile" ::: $samples
797

798
n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort ) <(cd scratch && find . | grep -o '[0-9]*' |sort )|wc -l)
Christian Meesters's avatar
Christian Meesters committed
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815
if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
    # shrink the alloction, such that only the minimum necessary is accounted for
    #scontrol update job=$SLURM_JOB_ID NumNodes=1  
    pwd
    # merge all xml files
    STARTC=$(date +%s.%N)
    outfile="${JOBTAG}.xml"
    # select the first of all files
    some_file=$(find ./output -name 'group*' | head -n1)
    zcat $some_file | head -n21 > $outfile
    # extract header information and write to outfile
    for split_file in ./output/group_*.xml.gz; do
        zcat $split_file | tail -n+22  |head -n-3 >> $outfile
    done
    # extract footer information and write to outfile
    zcat $some_file | tail -n3 >> $outfile
    pigz -p 16 $outfile &
816
    rm -rf $WORKDIR/$SPLITFILEDIR &
817
    rm ./output/group_*.xml &
818
    wait
819 820 821
    ENDC=$(date +%s.%N)
    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
# merge all standard output files (outfmt -6 -- tabular output) files
822 823 824 825 826 827
elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
    STARTC=$(date +%s.%N)
    outfile="${JOBTAG}.out"
    # write anything to the output file
    for split_file in ./output/group_*gz; do
        zcat $split_file >> $outfile
828
        rm $split_file
829 830 831
    done
    pigz -p 16 $outfile &
    rm -rf $WORKDIR/$SPLITFILEDIR &
832
    rmdir ./output &
833
    wait
834 835
    ENDC=$(date +%s.%N)
    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
836
fi
Christian Meesters's avatar
Christian Meesters committed
837 838 839 840 841

# marks the end of this run
END=$(date +%s.%N)
elapsed=$(bc <<< "scale=1; (($END-$START))/60")

842
echo "parallel_BLAST took $elapsed minutes to run; the compression took $elapsedc minutes"
843
    
844 845
# TODO: Check: 1 output item per input scratch file?
# TODO: If not re-submit with correct/adjusted job size