Commit f9b1e397 authored by Christian Meesters's avatar Christian Meesters
Browse files

- outmerging for out files

- improved UI
- minor debugging
parent 5ed2df07
......@@ -43,15 +43,11 @@ module purge
# load the most current version of GNU parallel
module load tools/parallel
# load the most current version of BLAST +
#module load bio/BLAST+ # do not rely on most recent version
module load bio/BLAST+/2.9.0-foss-2018a
module load lang/Python/3.6.4-foss-2018a
### setup variable for THIS script; giving absolute path if necessary
SCRIPT="$0"
SCRIPT_VERSION="0.4"
SCRIPT_VERSION="0.5"
# TODO: delete the following 3 functions, once sbcast is working
function queue {
......@@ -102,6 +98,10 @@ error() {
(>&2 echo "ERROR: $1")
}
#######################################
# Report a non-fatal problem to the user.
# Arguments: $1 - message text
# Outputs:   "WARNING: <message>" on stderr; nothing on stdout
#######################################
warning() {
  printf 'WARNING: %s\n' "$1" >&2
}
# THE DEFAULTS INITIALIZATION - POSITIONALS
_positionals=()
_arg_leftovers=()
......@@ -112,10 +112,11 @@ _arg_queue=nodeshort
_arg_assoc=$(sacct -nu $USER -o Account | tail -n1)
declare -i _arg_nodes=1
_arg_reservation=''
declare _arg_mem=1G
declare _memory_request=115500M
declare _arg_mem=0
declare -i _arg_blast_threads=1
_arg_blast_params=''
declare -i _arg_splitup_per_queryfile=20
declare -i _arg_splitup_per_queryfile=0
declare _arg_ramdisk=40G
_arg_blastdir='.'
_arg_executable='blastx'
......@@ -126,26 +127,26 @@ print_help ()
{
echo "This script's help msg"
printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-s|--splitup <arg>] [-N|--nodes <arg>] [--executable <arg>] [-m|--mem <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
printf 'HINT: The FASTA and DATABASE items need to be full paths to files.'
printf "\t%s\n" "<FASTA>: path to the query FASTA file"
printf "\t%s\n" "<DATABASE>: path to the database file"
printf "\t%s\n" "-l,--runlimit: runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)"
printf "\t%s\n" "-p,--partition: queue (default is nodeshort)"
printf "\t%s\n" "-A,--account: queue (default is the last submit account; an error is triggered if none specified nor can be deduced)"
printf "\t%s\n" "-N,--nodes: number of nodes (1 is the default)"
printf "\t%s\n" "--reservation: reservation to use (none is the default)"
printf "\t%s\n" "--time: time in minutes (300 is the default)"
printf "\t%s\n" "-m,--mem: memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn)"
printf "\t%s\n" "-r,--ramdisk: ramdisk size in units of GiB (default is 40 GiB)"
printf "\t%s\n" "-t,--threads: blast threads (default is 1)"
printf "\t%s\n" "--blastparams: blast parameters (default is -outfmt 5 (for xml output))"
printf "\t%s\n" "-s,--splitup: No. of FASTA sequences per query file (default is 20)"
printf "\t%s\n" "--blastdir: output directory (default is composition of input names)"
printf "\t%s\n" "--executable: choose executable (currently only from NCBI-BLAST, default: blastx)"
printf "\t%s\n" "--compress: if set, the output files will be merged and compressed (time consuming!, defaultt: off)"
printf "\t%s\n" "--test,--no-test: dry run, testing only (off by default)"
printf "\t%s\n" "--credits,--version: Prints credits and a brief version history and exits"
printf "\t%s\n" "-h,--help: Prints help"
printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n'
printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "<FASTA>" "path to the query FASTA file"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "<DATABASE>" "path to the database file"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-l,--runlimit" "runlimit default is 300 min, queue will be nodeshort, if <= 300 (default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-p,--partition" "queue (default is nodeshort)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-A,--account" "SLURM account (default is the last submit account; an error is triggered if none specified nor can be deduced)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes (1 is the default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use (none is the default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes (300 is the default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-m,--mem" "memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn, omit the unit for submitting)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-r,--ramdisk" "ramdisk size in units of GiB (default is 40 GiB)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for blank tabulated output))"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-s,--splitup" "No. of FASTA sequences per query file (default is 20)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, defaultt: off)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--test,--no-test" "dry run, testing only (off by default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--credits,--version" "Prints credits and a brief version history and exits"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-h,--help" "Prints help"
echo
echo -e "\e[3mWARNINGS:\e[0m"
echo -e "\e[3m- BLAST parameters:\e[0m"
......@@ -200,6 +201,12 @@ credits()
echo "- v0.3.2 -- 16. Jan. 2019 -- hot fix for new ramdisk and slurmstepd support"
echo "- v0.4 -- 06. Mar. 2019 -- refactored version:"
echo " - executables now pluggable"
echo "- v0.5 -- 17. Aug. 2019 -- fix: parser did not work for '--mem'-arg properly"
echo " update: - clearer UI"
echo " - better default memory settings"
echo " - faster stage-in for reference data"
echo " - automerge for -outfmt=6"
echo " - -outfmt=6 is now the default"
echo
echo "Current version is: $SCRIPT_VERSION"
echo
......@@ -423,11 +430,16 @@ FASTA=$_arg_fasta
DATABASE=$_arg_database
### check if query & database exist
if [ ! -e "$FASTA" ]; then
# Validate the positional inputs. Both checks are skipped in dry-run
# mode (_arg_test != "off").
if [[ "$_arg_test" == "off" ]]; then
  # the query must exist ...
  if [[ ! -e "$FASTA" ]]; then
    error "FASTA input was: '$FASTA' - no such file!"
    exit 1
  fi
  # ... and the database has to be given as a directory
  if [[ ! -d "$DATABASE" ]]; then
    error "DATABASE input was: '$DATABASE' - no such directory!"
    exit 1
  fi
fi
#TODO: differentiate between blastn,x,p -- for now, all are equal
if [ "blastx" = "${_arg_executable,,}" ]; then
executable="blastx"
......@@ -471,9 +483,26 @@ DB=${DATABASEID%.*}
JOBTAG="BLAST_${FA}_VS_${DB}"
#######################################
# Print how many FASTA sequences should go into one split file so that
# roughly the target number of split files is produced.
# Arguments: $1 - number of sequences in the query file
#            $2 - desired number of split files (optional, default 10000)
# Outputs:   the sequences-per-file value on stdout; never less than 1,
#            because a plain integer division would yield 0 for query
#            files with fewer sequences than the target
#######################################
_infer_splitup() {
  local -i n_seqs=${1:-0}
  local -i target=${2:-10000}
  (( target > 0 )) || target=10000
  local -i per_file=$(( n_seqs / target ))
  (( per_file >= 1 )) || per_file=1
  printf '%d\n' "$per_file"
}

# how many entries are there in the FASTA file?
echo "Checking input file"
# FASTA header lines start with '>'; count them directly with grep -c
nentries=$(grep -c '^>' -- "$FASTA")
# grep prints nothing if the file is unreadable (e.g. during a dry run)
nentries=${nentries:-0}

# we try to set the split number to a value, which ensures an output of
# ~ 10.000 split files
if (( _arg_splitup_per_queryfile != 0 )); then # the user thinks differently?
  nsplits=$(( nentries / _arg_splitup_per_queryfile ))
  if (( nsplits > 50000 )); then
    error "There would be more than '$nsplits' files in scratch."
    exit 1
  elif (( nsplits > 15000 )); then
    warning "There will be '$nsplits' files in scratch."
  fi
else # infer the value
  _arg_splitup_per_queryfile=$(_infer_splitup "$nentries")
fi
# default values, see:
# https://www.ncbi.nlm.nih.gov/books/NBK279675/
DEFAULT_BLASTPARAMS='-outfmt 5'
DEFAULT_BLASTPARAMS='-outfmt 6'
# sanity check: '-outfmt' in blast parameters?
if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
BLASTPARAMS=$_arg_blastparams
......@@ -483,8 +512,10 @@ fi
#######################################
# Classify the BLAST output format requested in a parameter string.
# Globals:   XMLOUT (written) - 1 for '-outfmt 5' (XML), else 0
#            OUTOUT (written) - 1 for '-outfmt 6' (tabular), else 0
# Arguments: $1 - the complete BLAST parameter string
# Notes:     the parameter string is the subject of the match and the
#            format flag is the pattern, so additional BLAST options or a
#            quoted format specifier (-outfmt "6 qseqid ...") are fine.
#            Both flags are always defined, so later integer tests on
#            them cannot fail on an unset variable.
#######################################
_classify_outfmt() {
  local params=$1
  local -r xml_re="-outfmt[[:space:]=]+[\"']?5([^0-9]|\$)"
  local -r tab_re="-outfmt[[:space:]=]+[\"']?6([^0-9]|\$)"
  XMLOUT=0
  OUTOUT=0
  if [[ "$params" =~ $xml_re ]]; then
    XMLOUT=1
  elif [[ "$params" =~ $tab_re ]]; then
    OUTOUT=1
  fi
}

# test whether the output is xml, tabular or something else
_classify_outfmt "$BLASTPARAMS"
# TODO: port to M2
......@@ -550,15 +581,52 @@ fi
### which is the reference directory size?
_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M
if [[ ! $SCRIPT == /* ]]; then
SCRIPT="$PWD/$SCRIPT";
fi
# which cluster are we on?
cluster=$(sacctmgr show cluster -p| tail -n1| cut -f1 -d '|')
# if the cluster is Mogon I, set the memory default accordingly:
# memory values (in MB, without unit) a user may select via --mem
allowed_mem_settings="115500 242500 497500"

#######################################
# Print the per-node memory request for sbatch's --mem option.
# Arguments: $1 - cluster name; "mogon" selects the Mogon I defaults
#            $2 - user-selected memory without unit; 0 or empty means
#                 "use the default"
#            $3 - BLAST executable name (blastn gets the bigger default)
#            $4 - space separated list of permitted user selections
# Outputs:   the request with an 'M' suffix on stdout
# Returns:   1 (printing nothing) if the user selection is not permitted
#######################################
_resolve_memory_request() {
  local cluster_name=$1
  local requested=${2:-0}
  local exe=${3,,}
  local allowed=$4

  if [[ "$requested" != "0" ]]; then
    # user tries to select a non-default memory: must be a plain number
    # that is a member of the allowed list
    [[ "$requested" =~ ^[0-9]+$ ]] || return 1
    [[ " $allowed " == *" $requested "* ]] || return 1
    printf '%sM\n' "$requested"
    return 0
  fi

  # no selection: pick the default for this cluster / executable
  if [[ "$cluster_name" == "mogon" ]]; then
    if [[ "$exe" == "blastn" ]]; then
      printf '%s\n' "242500M"
    else
      printf '%s\n' "115500M"
    fi
  else
    if [[ "$exe" == "blastn" ]]; then
      printf '%s\n' "246000M"
    else
      printf '%s\n' "120000M"
    fi
  fi
}

# set the memory request: a valid user selection is honoured, otherwise
# the cluster specific default is used; an invalid selection is fatal
if ! _memory_request=$(_resolve_memory_request "$cluster" "$_arg_mem" \
  "$_arg_executable" "$allowed_mem_settings"); then
  error "Memory selection ought to be one of [$allowed_mem_settings]"
  exit 1
fi
# how many entries are there in the FASTA file?
nentries=$(grep '>' $FASTA | wc -l)
# we try to set the split number to a value, which ensures an output of
# ~ 10.000 split files
### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
if [ -z "$SLURM_JOB_ID" ]; then
export SCRIPT_PATH=$(dirname $0)
submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_arg_mem --ramdisk=${_arg_ramdisk} -c $threads"
submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE"
if [ -n "$_arg_reservation" ]; then
......@@ -598,18 +666,35 @@ RAMDISK=$JOBDIR/ramdisk
HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ')
QUEUE=''
for HOST in $HOSTLIST; do
# when copying dereference putative links!
eval "ssh $HOST cp -Lr $DATABASEPATH $RAMDISK/. &"
PID=$!
queue $PID
# outcommented because of bug in slurm 16.05, see TODO-item
if [ -L ${DATABASEPATH} ]; then
warning "If the reference directory is a link, fast stage-in is not possible."
for fname in ${DATABASEPATH}/*; do
eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
done
else
for fname in ${DATABASEPATH}/*; do
if [ -L "$fname" ]; then
eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
else
eval "ssh $HOST dd bs=4096 if=$fname of=${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
fi
done
fi
# TODO: check for dereferencing links, before enabling
# TODO: check for performance, before re-enabling
#sbcast $FILE $RAMDISK/$(basename $FILE)
done
#DATABASE=$RAMDISK/$DATABASE
DATABASE=$RAMDISK/$(basename $DATABASEPATH)
DATABASE=$RAMDISK #/$(basename $DATABASEPATH)
WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME
# this script may never output to a user's $HOME
......@@ -700,6 +785,24 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
#rm ./output/group_*.xml &
#rm -rf ./scratch &
wait
elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
# Branch for non-XML output (OUTOUT == 1): runs only if no split is left
# unfinished and the user asked for --compress.
# shrink the allocation, such that only the minimum necessary is accounted for
#scontrol update job=$SLURM_JOB_ID NumNodes=1
pwd
# merge all per-split result files into one '<JOBTAG>.out' file
STARTC=$(date +%s.%N)
outfile="${JOBTAG}.out"
# select the first of all files
# NOTE(review): 'some_file' is not read anywhere in this branch -- confirm
# whether it is still needed
some_file=$(find ./output -name 'group*' | head -n1)
# decompress every gzipped split result and append it to the merged file
for split_file in ./output/group_*gz; do
zcat $split_file >> $outfile
done
# compress the merged file in the background (pigz with -p 16)
pigz -p 16 $outfile &
# NOTE(review): ENDC is taken before the background jobs are waited for, so
# elapsedc (minutes, one decimal) presumably covers the merge loop only -- confirm
ENDC=$(date +%s.%N)
elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
# remove $WORKDIR/$SPLITFILEDIR in the background, then wait for all
# background jobs (compression and removal) to finish
rm -rf $WORKDIR/$SPLITFILEDIR &
wait
fi
# marks the end of this run
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment