LA_Wrapper 33.5 KB
Newer Older
1 2
#!/bin/bash

3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
# Copyright (c)     -2014, Christoph Martin, JGU Mainz
#               2014-2019, Christian Meesters, JGU Mainz
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Christian Meesters or the JGU Mainz nor the names of
#       its contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
Christian Meesters's avatar
Christian Meesters committed
22
# ARE DISCLAIMED. IN NO EVENT SHALL CHRISTIAN MEESTERS OR THE JGU MAINZ BE LIABLE
23 24 25 26 27 28 29 30 31
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.


32
#set -x
33 34
set -e
PS4='Line ${LINENO}: '
35

Christian Meesters's avatar
Christian Meesters committed
36 37 38
# to measure the excecution time independent of SLURM
START=$(date +%s.%N)

39 40 41 42 43
# purging the module list, because interference with other modules
# should be avoided
module purge

# load the most current version of GNU parallel
Christian Meesters's avatar
Christian Meesters committed
44
module load tools/parallel/20190822
45

46
module load bio/Biopython/1.74-foss-2019a
Christian Meesters's avatar
Christian Meesters committed
47 48 49 50 51
#TODO: find a solution for the bug in BLAST+ AND to select the version by hand
module load bio/BLAST+/2.9.0-gompi-2019a
#module load bio/BLAST+/2.7.1-foss-2018a


52

53 54
### setup variable for THIS script; giving absolute path if necessary
SCRIPT="$0"
Christian Meesters's avatar
Christian Meesters committed
55
SCRIPT_VERSION="0.5.1"
56

57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
# TODO: delete the following 3 functions, once sbcast is working
function queue {
  QUEUE="$QUEUE $1"
  NUM=$((NUM+1))
}

function regeneratequeue {
  OLDREQUEUE=$QUEUE
  QUEUE=""
  NUM=0
  for PID in $OLDREQUEUE; do
    if [ -d /proc/$PID ]; then
      QUEUE="$QUEUE $PID"
      NUM=$((NUM+1))
    fi
  done
}

function checkqueue {
  OLDCHQUEUE=$QUEUE
  for PID in $OLDCHQUEUE; do
    if [ ! -d /proc/$PID ]; then
      regeneratequeue # at least one PID has finished
    fi
  done
}

die()
{
        local _ret=$2
        test -n "$_ret" || _ret=1
        test "$_PRINT_HELP" = yes && print_help >&2
        echo "$1" >&2
        exit ${_ret}
}

begins_with_short_option()
{
        local first_option all_short_options
        all_short_options='lqscmrh'
        first_option="${1:0:1}"
        test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
}

101 102 103 104 105 106 107 108
check_nucleotide_db() {
    echo "check not yet implemented"
}

check_protein_db() {
    echo "check not yet implemented"
}

109 110 111 112
# function to redirect simple error messages to stderr
error() {
    (>&2 echo "ERROR: $1") 
}
113

114 115 116 117
warning() {
    (>&2 echo "WARNING: $1") 
}

118 119 120 121 122 123
# THE DEFAULTS INITIALIZATION - POSITIONALS
_positionals=()
_arg_leftovers=()
# THE DEFAULTS INITIALIZATION - OPTIONALS
declare -i _arg_runlimit=300
_arg_queue=nodeshort
124
# TODO: ask for account, if string is NULL
125 126
_arg_assoc=$(sacct -nu $USER -o Account | tail -n1)
declare -i _arg_nodes=1
127
_arg_reservation=''
128 129
declare _memory_request=115500M
declare _arg_mem=0
130
declare -i _arg_blast_threads=1
131
_arg_blast_params=''
132
declare -i _arg_splitup_per_queryfile=0
133 134
declare _arg_ramdisk=40G
_arg_blastdir='.'
135
_arg_executable='blastx'
136
_arg_test=off
137
_arg_compress=on
138 139 140 141

print_help ()
{
        echo "This script's help msg"
142
        printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-N|--nodes <arg>] [--executable <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
        printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n'
        printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "<FASTA>" "path to the query FASTA file"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "<DATABASE>" "path to the database file"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-l,--runlimit" "runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-p,--partition" "queue (default is nodeshort)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-A,--account" "SLURM account (default is the last submit account; an error is triggered if none specified nor can be deduced)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes  (1 is the default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use  (none is the default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes  (300 is the default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for blank tabulated output))"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, defaultt: off)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--test,--no-test" "dry run, testing only (off by default)"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--credits,--version" "Prints credits and a brief version history and exits"
        printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-h,--help" "Prints help"
160
        echo
Christian Meesters's avatar
Christian Meesters committed
161 162 163 164 165 166 167 168 169 170
        echo -e "\e[3mWARNINGS:\e[0m"
        echo -e "\e[3m- BLAST parameters:\e[0m"
        echo "  There are no BLAST default parameters, other than specifing XML output, the database and"
        echo "  queries as well as the numer of threads (all set by this script)."
        echo "  Detailed overview on parameters can be shown by entering <blastexecutable> -help"
        echo -e "\e[3m- Limitations:\e[0m"
        echo "  - Currently only NCBI-BLAST+ executables are supported."
        echo "  - References may be as big as 30G, 50G may also work."
        echo "    References beyond this this may or not work."
        echo "    Be sure to reserve sufficient RAM."
Christian Meesters's avatar
Christian Meesters committed
171 172 173 174 175 176 177 178 179 180
        echo 
        echo "Planned features for upcoming versions:"
        echo "- better, more stable user interface"
        echo "- support for alternate implementations, e.g. BLAT, diamond"
        echo "- automatic merging of xml output"
        echo "- restart capability to resume work in case of timelimits."
        echo "- beeond support for handling greater references"
        echo "- automatic reference database generation in case of version mismatches or similar"
        echo 
        echo "Minor improvements will be implemented as time permits. Yet, to make this possible,"
Christian Meesters's avatar
Christian Meesters committed
181
        echo -e "\e[1myou can request a collaboration\e[0m (for minor add ons an acknowladegement by name will do)."
182 183
}

Christian Meesters's avatar
Christian Meesters committed
184 185 186 187
credits()
{
        echo "The original implementation (2013/2014) was written by Christoph Martin (ZDV, UNIX group)"
        echo "Benjamin Rieger (Institut für Molekulargenetik) contributed a perl implementation"
188
        echo "of a format conform splitting of FASTA files, which is not used, anymore."
Christian Meesters's avatar
Christian Meesters committed
189
        echo "The original implemenation was a LSF chain job. It was eventually adopted and maintained by"
190
        echo "Christian Meesters (ZDV, HPC group) from 2017 onwards."
Christian Meesters's avatar
Christian Meesters committed
191
        echo 
192 193 194 195
        echo "I am particularly grateful for their feedback to:"
        echo "- Lukas Hellman (AG Hankeln)"
        echo "- Benjamin Rieger (NGS Facility)"
        echo 
Christian Meesters's avatar
Christian Meesters committed
196
        echo "History of the re-implementation:"
197 198 199 200
        echo "- v0.1   -- 27. Sep. 2017 -- release of the re-implementation for SLURM supporting the"
        echo "                             ability to compute accross nodes."
        echo "- v0.1.1 -- 19. Oct. 2017 -- bug fix: blast parameters now correctly transferred to"
        echo "                             blast(x)."
Christian Meesters's avatar
Christian Meesters committed
201
        echo "- v0.1.2 -- 25. Oct. 2017 -- minor fixes and clarifications"
202
        echo "- v0.2.b -- 18. Jan. 2018 -- new features:"
203 204 205
        echo "                             - possible to choose between NCBI-BLAST executables"
        echo "                             - better user feedback / self documentation \(at least a start\)"
        echo "                             - error messages are now directed to stderr"
206
        echo "                             - jobs may use reservations (e.g. for courses)"
207 208 209
        echo "                             changes:"
        echo "                             - deleted default evalue and max_target_seqs settings"
        echo "                             - XML output is mandatory"
210 211
        echo "                             fix:"
        echo "                             - introduced missing --time option"
212
        echo "                             - changed the blast parameter interface"
Christian Meesters's avatar
Christian Meesters committed
213
        echo "- v0.3   -- 21. Feb. 2018 -- faster merging of output files, parallel zipping"
214 215 216 217
        echo "- v0.3.1 -- 15. May  2018 -- bugfixes in handling the blast executables"
        echo "- v0.3.2 -- 16. Jan. 2019 -- hot fix for new ramdisk and slurmstepd support"
        echo "- v0.4   -- 06. Mar. 2019 -- refactored version:"
        echo "                             - executables now pluggable"
218
        echo "- v0.5   -- 21. Aug. 2019 -- fix: parser did not work for '--mem'-arg properly"
219 220 221 222 223
        echo "                             update: - clearer UI"
        echo "                                     - better default memory settings"
        echo "                                     - faster stage-in for reference data"
        echo "                                     - automerge for -outfmt=6"
        echo "                                     - -outfmt=6 is now the default"
224 225 226 227
        echo "- v0.5.1 -- 29. Aug. 2019 -- numerous housekeeping fixes"
        echo "- v0.5.2 -- 02. Sep. 2019 -- fix:"
        echo "                             - consistent biopython inclusion"
        echo "                             - auto-detection of database size and memory selection"
228 229
        echo 
        echo "Current version is: $SCRIPT_VERSION"
Christian Meesters's avatar
Christian Meesters committed
230 231 232 233 234
        echo
        echo "The re-implementation supporting parallel BLAST+ jobs accross several compute nodes has been"
        echo "written and is maintained by Christian Meesters (ZDV, HPC group)."
        echo "A (personal) acknowledgement is welcomed. Please refer to:"
        echo "https://hpc.uni-mainz.de/high-performance-computing/publikationen/"
235
        exit
Christian Meesters's avatar
Christian Meesters committed
236 237
}

238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
# command line parsing
while test $# -gt 0
do
        _key="$1"
        case "$_key" in
                -l*|--runlimit|--runlimit=*)
                        _val_from_long="${_key##--runlimit=}"
                        _val_from_short="${_key##-l}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_runlimit="$_val"
                        ;;
                -p*|--partition|--partition=*)
                        _val_from_long="${_key##--partition=}"
                        _val_from_short="${_key##-p}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_queue="$_val"
                        ;;
                -A*|--account|--account=*)
                        _val_from_long="${_key##--account=}"
                        _val_from_short="${_key##-A}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_assoc="$_val"
                        ;;
                -N*|--nodes|--nodes=*)
                        _val_from_long="${_key##--nodes=}"
                        _val_from_short="${_key##-N}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_nodes="$_val"
                        ;;
307 308 309 310 311 312 313 314 315 316
                --reservation|--reservation=*)
                        _val="${_key##--reservation=}"
                        if test "$_val" = "$_key"
                        then
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_reservation="$_val"
                        ;;
317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
                --time|--time=*)
                        _val_from_long="${_key##--time=}"
                        _val_from_short="${_key##-N}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_runlimit="$_val"
                        ;;
Christian Meesters's avatar
Christian Meesters committed
333 334
                -m*|--mem|--mem=*)
                        _val_from_long="${_key##--mem=}"
335 336 337 338 339 340 341 342
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
Christian Meesters's avatar
Christian Meesters committed
343
                        _arg_mem="$_val"
344
                        ;;
345 346 347 348 349 350 351 352 353 354 355 356
                --blastparams|--blastparams=*)
                        _val_from_long="${_key##--blastparams=}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_blastparams="$_val"
                        ;;
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
                -t*|--threads|--threads=*)
                        _val_from_long="${_key##--threads=}"
                        _val_from_short="${_key##-t}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_blast_threads="$_val"
                        ;;
                -s*|--splitup|--splitup=*)
                        _val_from_long="${_key##--splitup=}"
                        _val_from_short="${_key##-s}"
                        if test "$_val_from_long" != "$_key"
                        then
                                _val="$_val_from_long"
                        elif test "$_val_from_short" != "$_key" -a -n "$_val_from_short"
                        then
                                _val="$_val_from_short"
                        else
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_splitup_per_queryfile="$_val"
                        ;;
                --blastdir|--blastdir=*)
                        _val="${_key##--blastdir=}"
                        if test "$_val" = "$_key"
                        then
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_blastdir="$_val"
                        ;;
399 400 401 402 403 404 405 406 407 408
                --executable|--executable=*)
                        _val="${_key##--executable=}"
                        if test "$_val" = "$_key"
                        then
                                test $# -lt 2 && die "Missing value for the optional argument '$_key'." 1
                                _val="$2"
                                shift
                        fi
                        _arg_executable="$_val"
                        ;;
Christian Meesters's avatar
Christian Meesters committed
409 410 411 412
                --no-compress|--compress)
                        _arg_compress="on"
                        test "${1:0:5}" = "--no-" && _arg_compress="off"
                        ;;
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
                --no-test|--test)
                        _arg_test="on"
                        test "${1:0:5}" = "--no-" && _arg_test="off"
                        ;;
                --credits|--version)
                        credits
                        exit 0
                        ;;
                -h*|--help)
                        print_help
                        exit 0
                        ;;
                *)
                        _positionals+=("$1")
                        ;;
        esac
        shift
done

_positional_names=('_arg_fasta' '_arg_database' )
_required_args_string="'FASTA' and 'DATABASE'"
#test ${#_positionals[@]} -lt 2 && _PRINT_HELP=yes die "FATAL ERROR: Not enough positional arguments - we require exactly 2 (namely: $_required_args_string), but got only ${#_positionals[@]}." 1
#test ${#_positionals[@]} -gt 2 && _PRINT_HELP=yes die "FATAL ERROR: There were spurious positional arguments --- we expect exactly 2 (namely: $_required_args_string), but got ${#_positionals[@]} (the last one was: '${_positionals[*]: -1}')." 1
#for (( ii = 0; ii < ${#_positionals[@]}; ii++))
for (( ii = 0; ii < 2; ii++))
do
        eval "${_positional_names[ii]}=\${_positionals[ii]}" || die "Error during argument parsing, possibly an Argbash bug." 1
done
_our_args=$((${#_positionals[@]} - ${#_positional_names[@]}))
442 443
for (( ii = 0; ii < _our_args; ii++)); do
    _positional_names+=("_arg_leftovers[(($ii + 1))]")
444 445 446 447
done

### get query and database files
FASTA=$_arg_fasta
448
DATABASE=$(realpath $_arg_database)
449 450

### check if query & database exist
451
if [[ $_arg_test == "off" ]] && [ ! -e "$FASTA" ]; then
452
    error "FASTA input was: '$FASTA' - no such file!"
453 454 455
    exit 1
fi

456 457 458 459 460
if [[ $_arg_test == "off" ]] && [ ! -d "$DATABASE" ]; then
    error "DATABASE input was: '$DATABASE' - no such directory!"
    exit 1
fi

461 462 463
if [ "blastx" = "${_arg_executable,,}" ]; then
    executable="blastx"
    threads=2
464
    DBSUFFIX=".nal" # db suffix to be removed from nal file - not working, if nal file not present
465 466 467 468 469 470 471 472
    if [ -z "$SLURM_JOB_ID" ]; then
        source $(dirname "$0")/blast_wrap.sh
    else
        source "${SCRIPT_PATH}"/blast_wrap.sh
    fi
elif [ "blastn" = "${_arg_executable,,}" ]; then
    executable="blastn"
    threads=8
473
    #check_nucleotide_db
474 475 476 477 478 479 480 481
    if [ -z "$SLURM_JOB_ID" ]; then
        source $(dirname "$0")/blast_wrap.sh
    else
        source "${SCRIPT_PATH}"/blast_wrap.sh
    fi
elif [ "blastp" = "${_arg_executable,,}" ]; then
    executable="blastp"
    threads=2
482
    #check_protein_db
483 484 485 486 487 488 489 490 491 492
    if [ -z "$SLURM_JOB_ID" ]; then
        source $(dirname "$0")/blast_wrap.sh
    else
        source "${SCRIPT_PATH}"/blast_wrap.sh
    fi
else 
    error "executable '$_arg_executable' not recognized."
    print_help
    exit 2
fi   
493 494 495

### prepare filepath and -names for creating working folder
FASTAPATH=$(dirname $FASTA)
496
DATABASEPATH=$(realpath $DATABASE)
497 498 499 500 501 502 503 504 505

FASTAID=$(basename $FASTA)
DATABASEID=$(basename $DATABASE)

FA=${FASTAID%%.*}
DB=${DATABASEID%.*}

JOBTAG="BLAST_${FA}_VS_${DB}"

506 507 508 509 510 511 512 513 514 515 516
# how many entries are there in the FASTA file?
echo "Checking input file"
nentries=$(grep '>' $FASTA | wc -l)
# we try to set the split number to a value, which ensures an output of 
# ~ 10.000 split files
if [ $_arg_splitup_per_queryfile -ne 0 ]; then # the user thinks differently?
    nsplits=$((nentries / _arg_splitup_per_queryfile))
    if [ $nsplits -gt 50000 ]; then
        error "There would be more than '$nsplits' files in scratch."
        exit 1
    elif [ $nsplits -gt 15000 ]; then
517
        warning "There will be '$nsplits' files in scratch -- resulting in poor performance."
518 519
    fi
else # infer the value 
520
    _arg_splitup_per_queryfile=$((nentries / 5000))
521 522
fi

523 524
# default values, see:
#  https://www.ncbi.nlm.nih.gov/books/NBK279675/
525
DEFAULT_BLASTPARAMS='-outfmt 6'
526 527 528 529
# sanity check: '-outfmt' in blast parameters?
if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
    BLASTPARAMS=$_arg_blastparams
else
Christian Meesters's avatar
Christian Meesters committed
530 531
    BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS"
fi
532 533

### testing for output options - to be used later 
Christian Meesters's avatar
Christian Meesters committed
534
# test whether the output is xml or not
535
if [[ '-outfmt 5' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
Christian Meesters's avatar
Christian Meesters committed
536
    XMLOUT=1
537
    OUTOUT=0
538 539
# test whether the output is plain tabular or not
elif [[ '-outfmt 6' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
Christian Meesters's avatar
Christian Meesters committed
540
    XMLOUT=0
541
    OUTOUT=1
542
fi
543

544
# TODO: port to M2
545 546 547 548 549 550 551 552 553 554 555 556 557 558
### Auto-Adjust the Queue
if [ $_arg_runlimit -le 300 ]; then
    if [ "$_arg_queue" != "nodeshort" ]; then
        _arg_queue="${_arg_queue/long/short}"
    fi
else
    if [ "$_arg_queue" != "nodeslong" ]; then
        _arg_queue="${_arg_queue/short/long}"
    fi
fi

### check if working directory already exists
if [ -z "$SLURM_JOB_ID" ] && [[ $_arg_test == "off" ]]; then
    if [ -d "$JOBTAG" ]; then
559
        echo "$JOBTAG : directory already exists! Remove? (y/[n])"
560 561 562
        echo -n '>'
        read ENTER
        
563
        if [[ ${ENTER,,} = 'y' ||  ${ENTER,,} == 'yes' ]]; then
564 565 566
            echo "removing directory $JOBTAG"
            rm -r $JOBTAG
        else
Christian Meesters's avatar
Christian Meesters committed
567
            echo "So, you want to continue regardless (using the existing scratch files)? ([y]/n)"
568 569
            echo -n '>'
            read ENTER
Christian Meesters's avatar
Christian Meesters committed
570
            if [[ ${ENTER,,} = 'n' || ${ENTER,,} == 'no' ]] ; then
571 572
                exit
            fi
573 574 575 576 577 578 579 580 581 582 583 584 585 586 587
        fi
    fi
fi

# we just keep the current working directory
PWD=$(pwd)

### give query and database absolute path if necessary
if [[ ! $FASTAPATH == /* ]]; then
    FASTAPATH="$PWD/$FASTAPATH";
fi

FASTA="$FASTAPATH/$FASTAID"

### setup blast and splitup executable; check if exist
588
allowed_executables="blastx blastp blastn"
Christian Meesters's avatar
Christian Meesters committed
589 590 591
if [[ ! $allowed_executables =~ (^| [[:space:]])"$_arg_executable"($| )  ]]; then
    BLASTEXE=$(which $_arg_executable)
else
592
    error "$_arg_executable ought to be one of [$allowed_executables]"
Christian Meesters's avatar
Christian Meesters committed
593
    exit 1
594
fi
Christian Meesters's avatar
Christian Meesters committed
595
export _arg_executable
596

597
### which is the reference directory size?
598
_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )
599
if [[ ! $SCRIPT == /* ]]; then
600
   SCRIPT="$PWD/$SCRIPT";
601 602
fi

603 604 605
# which cluster are we on?
cluster=$(sacctmgr show cluster -p| tail -n1| cut -f1 -d '|')
# if the cluster is Mogon I, set the memory default accordingly:
606
allowed_mem_setting=""
607
if [ "$cluster" == "mogon" ]; then
608 609 610 611 612
    allowed_mem_settings="115500 242500 497500"
    # add a savety measure (100 MB, each core)
    for setting in $allowed_mem_settings;  do
        if [ $((_arg_ramdisk + 6400 )) -lt $setting ]; then
            allowed_mem_setting="$allowed_mem_setting $setting"
613
        fi
614 615 616 617 618 619 620 621 622 623 624 625
    done
    # test whether there is any valid setting left
    if [ -z "$allowed_mem_setting" ]; then
        error "database > available + necessary RAM"
        exit 1
    fi
    # remove first space, if any
    allowed_mem_setting="${allowed_mem_setting/ /}"
    if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
        if [[ ! "$allowed_mem_setting" =~ (^|[[:space:]])"$_arg_mem"($|[[:space:]]) ]]; then
           error "Memory selection out to be one of [$allowed_mem_setting]"
           exit 1
626
        fi
627 628 629 630 631 632 633 634
    # select a default
    else 
        _memory_request=$(echo $allowed_mem_setting | cut -d" " -f1)M
        #if [ "$_arg_executable" == "blastn" ]; then
        #    _memory_request="242500M"
        #else
        #    _memory_request="115500M"
        #fi
635
    fi
636
else # to be implemented for MII
637 638 639 640 641 642 643 644 645 646 647 648 649
    if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
        allowed_mem_setting="115500 242500 497500"
        if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then
            error "Memory selection out to be one of [$allowed_mem_settings]"
        fi
    else # set a default memory
        if [ "$_arg_executable" == "blastn" ]; then
            _memory_request="246000M"
        else
            _memory_request="120000M"
        fi
    fi
fi
650 651
# finally add another MB to the ramdisk to be save and the unit
_arg_ramdisk=$((_arg_ramdisk + 1024 ))M
652

653
### how many entries are there in the FASTA file?
654 655 656 657 658
nentries=$(grep '>' $FASTA | wc -l)
# we try to set the split number to a value, which ensures an output of 
# ~ 10.000 split files


659 660
### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
if [ -z "$SLURM_JOB_ID" ]; then
661
    export SCRIPT_PATH=$(dirname $0)
Christian Meesters's avatar
Christian Meesters committed
662
    submit_statement="sbatch -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
663 664 665 666 667 668 669 670

    script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation  --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE"
    if [ -n "$_arg_reservation" ]; then
        submit_statement="${submit_statement} --reservation=${_arg_reservation}"
    fi

    # paste them together
    submit_call="${submit_statement} ${script_statement}"
671 672 673 674 675
    if [[ $_arg_test == "on" ]]; then
        echo "Just testing - this command would be submitted:"
        echo $submit_call
        exit
    fi
676
    echo "sending job:"
677
    echo $submit_call
678
    eval $submit_call
679 680 681
    exit
fi

682 683 684 685 686 687 688 689 690 691 692
#### Self-Documentation
echo "You are using $0, version $SCRIPT_VERSION"
echo 
echo "Self-Documentation (on $(date))"
[[ $_arg_test == "on" ]] && echo "  - This is a test-run, only"
echo "  - The query input is '$FASTA'"
echo "  - The database is '$DATABASE'"
BLASTVERSION=$($BLASTEXE -version | head -n1 | cut -f2 -d' ')
echo "  - The executable is '$BLASTEXE', version: $BLASTVERSION"
echo "  - Parameters to your call are: $BLASTPARAMS"

693 694 695 696 697 698 699 700 701
### set variables for lokal (node side) directories (working & ramdisk)
JOBDIR=/localscratch/$SLURM_JOB_ID
RAMDISK=$JOBDIR/ramdisk

# copy DB onto ramdisk
# TODO: change to sbcast, once available
HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ')
QUEUE=''

Christian Meesters's avatar
Christian Meesters committed
702 703 704 705 706 707 708 709 710 711 712 713
#myhost=$(hostname -f)
stagefile=/localscratch/$SLURM_JOB_ID/dummy_stagein.sh
rstagefile=/localscratch/$SLURM_JOB_ID/stagein.sh
source "${SCRIPT_PATH}"/stage_in.sh
stage_in_writer
chmod +x $stagefile

# distribute the stagewriter
sbcast $stagefile $rstagefile

rm $stagefile
stagefile=$rstagefile
714

Christian Meesters's avatar
Christian Meesters committed
715 716
# we would not need this loop with regard to slurm, but as we have
# asynchronous tasks already, we keep track with the queue
717
for HOST in $HOSTLIST; do
Christian Meesters's avatar
Christian Meesters committed
718 719
   srun -w $HOST -N1 -n1 -c1 --mem-per-cpu=5000M $stagefile &
   queue $!
720 721 722
done

WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME
723 724
# this script may never output to a user's $HOME
if [[ *"$WORKDIR"* = 'home' ]]; then
Christian Meesters's avatar
Christian Meesters committed
725
    error "Cowardly refusing to operate in a home directory."
726
fi
727 728 729 730 731 732 733
# set path names to ease maintance
SPLITFILEDIR=scratch

### check if exists, if no -> firsttime run, create subdirs and enter, split query file, else just enter
if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
    mkdir -p "$WORKDIR/$SPLITFILEDIR" || exit 1;
    mkdir -p "$WORKDIR/output" || exit 1;
734
    cd "$WORKDIR"
735
    echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
736
    eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile " &     # splitup queryfile
Christian Meesters's avatar
Christian Meesters committed
737
    queue $!
738 739
fi

740 741 742 743 744 745
# wait until the copy and a possible scratch generation are finished
while  [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
   checkqueue
   sleep 5
done

746 747 748 749 750 751
DATABASE=$(find $RAMDISK -type f -print -quit)
# strip suffix - if more than one dot
while [ $(grep -o "\." <<< $DATABASE | wc -l) -gt 1 ]; do
   DATABASE=${DATABASE%.*}
done

Christian Meesters's avatar
Christian Meesters committed
752 753 754 755 756
if [[ -z $DATABASE ]]; then
    error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
    exit 1
fi

757 758 759 760 761 762 763 764
cd "$WORKDIR"

# calculating the degree of parallelism is necessary in order not to oversaturate with srun processes.
# As $SLURM_CPUS_PER_TASK can be unset, we need to set it in those cases
if [[ -z "$SLURM_CPUS_PER_TASK" ]]; then
   declare -i SLURM_CPUS_PER_TASK=1
fi

765 766 767 768 769 770 771 772 773 774 775 776
### a temporary script to conduct the alignment
cmdfile=/localscratch/$SLURM_JOB_ID/dummy.sh
cmdfilewriter

chmod +x $cmdfile

newcmd=/localscratch/$SLURM_JOBID/dummy_wrapper.sh
sbcast $cmdfile $newcmd

rm $cmdfile
cmdfile=$newcmd

777
samples=$(find $(pwd) -type f -name 'group*.fasta')
778 779
### append a finishing token to the samples
samples+=('done')
Christian Meesters's avatar
Christian Meesters committed
780 781 782

parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS "

783
srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))"
784
$parallel "$srun" "$cmdfile" ::: $samples
785

786 787
echo before wait
echo  $n_unfinished_files  $_arg_compress $OUTOUT 
788
wait
789 790
echo after wait
echo  $n_unfinished_files  $_arg_compress $OUTOUT 
791

792
n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort ) <(cd scratch && find . | grep -o '[0-9]*' |sort )|wc -l)
793 794
echo after setting
echo  $n_unfinished_files  $_arg_compress $OUTOUT 
Christian Meesters's avatar
Christian Meesters committed
795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811
if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
    # shrink the alloction, such that only the minimum necessary is accounted for
    #scontrol update job=$SLURM_JOB_ID NumNodes=1  
    pwd
    # merge all xml files
    STARTC=$(date +%s.%N)
    outfile="${JOBTAG}.xml"
    # select the first of all files
    some_file=$(find ./output -name 'group*' | head -n1)
    zcat $some_file | head -n21 > $outfile
    # extract header information and write to outfile
    for split_file in ./output/group_*.xml.gz; do
        zcat $split_file | tail -n+22  |head -n-3 >> $outfile
    done
    # extract footer information and write to outfile
    zcat $some_file | tail -n3 >> $outfile
    pigz -p 16 $outfile &
812
    rm -rf $WORKDIR/$SPLITFILEDIR &
813
    rm ./output/group_*.xml &
814
    wait
815 816 817
    ENDC=$(date +%s.%N)
    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
# merge all standard output files (outfmt -6 -- tabular output) files
818 819 820 821 822 823
elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
    STARTC=$(date +%s.%N)
    outfile="${JOBTAG}.out"
    # write anything to the output file
    for split_file in ./output/group_*gz; do
        zcat $split_file >> $outfile
824
        rm $split_file
825 826 827
    done
    pigz -p 16 $outfile &
    rm -rf $WORKDIR/$SPLITFILEDIR &
828
    rmdir ./output &
829
    wait
830 831
    ENDC=$(date +%s.%N)
    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
832
fi
Christian Meesters's avatar
Christian Meesters committed
833 834 835 836 837

# marks the end of this run
END=$(date +%s.%N)
elapsed=$(bc <<< "scale=1; (($END-$START))/60")

838
echo "parallel_BLAST took $elapsed minutes to run; the compression took $elapsedc minutes"
839
    
840 841
# TODO: Check: 1 output item per input scratch file?
# TODO: If not re-submit with correct/adjusted job size