Commit 1d6458be authored by Christian Meesters

Merge branch 'devel' into 'master'

Devel

Closes #30

See merge request !18
parents 5509f2df cd18464c
......@@ -12,14 +12,16 @@ easyblock = 'Binary'
sources = ['./parallel_BLAST/LA_Wrapper',
'./parallel_BLAST/cleanup.sh',
'./parallel_BLAST/blast_wrap.sh',
'./parallel_BLAST/splitter.py']
'./parallel_BLAST/splitter.py',
'./parallel_BLAST/stage_in.sh']
unpack_sources = False
files_to_copy = ['LA_Wrapper',
'cleanup.sh',
'blast_wrap.sh',
'splitter.py']
'splitter.py',
'stage_in.sh']
postinstallcmds = ['mv %(installdir)s/parallel_BLAST/* %(installdir)s && rmdir %(installdir)s/parallel_BLAST']
......
......@@ -43,8 +43,7 @@ module purge
# load the most current version of GNU parallel
module load tools/parallel/20190822
#module load lang/Python/3.6.4-foss-2018a
module load lang/Python/3.7.4-GCCcore-8.3.0
module load bio/Biopython/1.74-foss-2019a
#TODO: find a solution for the bug in BLAST+ AND to select the version by hand
module load bio/BLAST+/2.9.0-gompi-2019a
#module load bio/BLAST+/2.7.1-foss-2018a
......@@ -53,7 +52,7 @@ module load bio/BLAST+/2.9.0-gompi-2019a
### setup variable for THIS script; giving absolute path if necessary
SCRIPT="$0"
SCRIPT_VERSION="0.5"
SCRIPT_VERSION="0.5.3"
# TODO: delete the following 3 functions, once sbcast is working
function queue {
......@@ -99,6 +98,14 @@ begins_with_short_option()
test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
}
check_nucleotide_db() {
echo "check not yet implemented"
}
check_protein_db() {
echo "check not yet implemented"
}
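The two database checks above are still stubs. A minimal sketch of what they could eventually do (hypothetical, not part of this merge request; it assumes the database prefix is passed as the first argument) is to test for the index files that makeblastdb produces, .nin/.nal for nucleotide and .pin/.pal for protein databases:

    check_nucleotide_db() {
        # $1: database prefix (path without extension); hypothetical argument
        if [ ! -e "$1.nin" ] && [ ! -e "$1.nal" ]; then
            error "no nucleotide index (.nin/.nal) found for '$1'"
            exit 1
        fi
    }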
# function to redirect simple error messages to stderr
error() {
(>&2 echo "ERROR: $1")
......@@ -128,11 +135,12 @@ _arg_blastdir='.'
_arg_executable='blastx'
_arg_test=off
_arg_compress=on
_arg_debug=off
print_help ()
{
echo "This script's help msg"
printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-s|--splitup <arg>] [-N|--nodes <arg>] [--executable <arg>] [-m|--mem <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-N|--nodes <arg>] [--executable <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n'
printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "<FASTA>" "path to the query FASTA file"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "<DATABASE>" "path to the database file"
......@@ -142,11 +150,8 @@ print_help ()
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes (1 is the default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use (none is the default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes (300 is the default)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-m,--mem" "memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn, omit the unit for submitting)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-r,--ramdisk" "ramdisk size in units of GiB (default is 40 GiB)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for blank tabulated output))"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-s,--splitup" "No. of FASTA sequences per query file (default is to generate ~5000 files)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)"
printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, defaultt: off)"
......@@ -217,6 +222,11 @@ credits()
echo " - faster stage-in for reference data"
echo " - automerge for -outfmt=6"
echo " - -outfmt=6 is now the default"
echo "- v0.5.1 -- 29. Aug. 2019 -- numerous housekeeping fixes"
echo "- v0.5.2 -- 02. Sep. 2019 -- fix:"
echo " - consistent biopython inclusion"
echo " - auto-detection of database size and memory selection"
echo "- v0.5.3 -- 24. Sep. 2019 -- fix: dereferencing of reference / database files re-enabled"
echo
echo "Current version is: $SCRIPT_VERSION"
echo
......@@ -406,6 +416,10 @@ do
_arg_test="on"
test "${1:0:5}" = "--no-" && _arg_test="off"
;;
--debug)
_arg_debug="on"
set -x
;;
--credits|--version)
credits
exit 0
......@@ -437,7 +451,7 @@ done
### get query and database files
FASTA=$_arg_fasta
DATABASE=$_arg_database
DATABASE=$(realpath $_arg_database)
### check if query & database exist
if [[ $_arg_test == "off" ]] && [ ! -e "$FASTA" ]; then
......@@ -453,6 +467,7 @@ fi
if [ "blastx" = "${_arg_executable,,}" ]; then
executable="blastx"
threads=2
DBSUFFIX=".nal" # db suffix to be removed from nal file - not working, if nal file not present
if [ -z "$SLURM_JOB_ID" ]; then
source $(dirname "$0")/blast_wrap.sh
else
......@@ -461,6 +476,7 @@ if [ "blastx" = "${_arg_executable,,}" ]; then
elif [ "blastn" = "${_arg_executable,,}" ]; then
executable="blastn"
threads=8
#check_nucleotide_db
if [ -z "$SLURM_JOB_ID" ]; then
source $(dirname "$0")/blast_wrap.sh
else
......@@ -469,6 +485,7 @@ elif [ "blastn" = "${_arg_executable,,}" ]; then
elif [ "blastp" = "${_arg_executable,,}" ]; then
executable="blastp"
threads=2
#check_protein_db
if [ -z "$SLURM_JOB_ID" ]; then
source $(dirname "$0")/blast_wrap.sh
else
......@@ -518,11 +535,14 @@ if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
else
BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS"
fi
### testing for output options - to be used later
# test whether the output is xml or not
if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then
if [[ '-outfmt 5' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
XMLOUT=1
OUTOUT=0
elif [[ '-outfmt 6' =~ "$BLASTPARAMS" ]]; then
# test whether the output is plain tabular or not
elif [[ '-outfmt 6' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
XMLOUT=0
OUTOUT=1
fi
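Note the direction of the =~ test above: the literal string '-outfmt 5' is the string being matched and the trimmed $BLASTPARAMS serves as the pattern, so the test only fires when the parameters are essentially just the format switch. A more conventional direction (a sketch for comparison only, not what the script currently does) would look for the switch inside the parameter string:

    if [[ $BLASTPARAMS =~ -outfmt[[:space:]]+5 ]]; then
        XMLOUT=1; OUTOUT=0
    elif [[ $BLASTPARAMS =~ -outfmt[[:space:]]+6 ]]; then
        XMLOUT=0; OUTOUT=1
    fi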
......@@ -568,12 +588,7 @@ if [[ ! $FASTAPATH == /* ]]; then
FASTAPATH="$PWD/$FASTAPATH";
fi
#if [[ ! $DATABASEPATH == /* ]]; then
# DATABASEPATH="$PWD/$DATABASEPATH";
#fi
FASTA="$FASTAPATH/$FASTAID"
#DATABASE="$DATABASEPATH/$DATABASEID"
### setup blast and splitup executable; check if exist
allowed_executables="blastx blastp blastn"
......@@ -586,7 +601,7 @@ fi
export _arg_executable
### which is the reference directory size?
_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M
_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )
if [[ ! $SCRIPT == /* ]]; then
SCRIPT="$PWD/$SCRIPT";
fi
......@@ -594,20 +609,37 @@ fi
# which cluster are we on?
cluster=$(sacctmgr show cluster -p| tail -n1| cut -f1 -d '|')
# if the cluster is Mogon I, set the memory default accordingly:
allowed_mem_setting=""
if [ "$cluster" == "mogon" ]; then
if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
allowed_mem_setting="115500 242500 497500"
if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then
error "Memory selection out to be one of [$allowed_mem_settings]"
allowed_mem_settings="115500 242500 497500"
# add a safety measure (100 MB per core)
for setting in $allowed_mem_settings; do
if [ $((_arg_ramdisk + 6400 )) -lt $setting ]; then
allowed_mem_setting="$allowed_mem_setting $setting"
fi
else # set a default memory
if [ "$_arg_executable" == "blastn" ]; then
_memory_request="242500M"
else
_memory_request="115500M"
done
# test whether there is any valid setting left
if [ -z "$allowed_mem_setting" ]; then
error "database > available + necessary RAM"
exit 1
fi
# remove first space, if any
allowed_mem_setting="${allowed_mem_setting/ /}"
if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
if [[ ! "$allowed_mem_setting" =~ (^|[[:space:]])"$_arg_mem"($|[[:space:]]) ]]; then
error "Memory selection out to be one of [$allowed_mem_setting]"
exit 1
fi
# select a default
else
_memory_request=$(echo $allowed_mem_setting | cut -d" " -f1)M
#if [ "$_arg_executable" == "blastn" ]; then
# _memory_request="242500M"
#else
# _memory_request="115500M"
#fi
fi
else
else # to be implemented for MII
if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
allowed_mem_setting="115500 242500 497500"
if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then
......@@ -621,20 +653,27 @@ else
fi
fi
fi
# finally, add another 1024 MB to the ramdisk to be safe and append the unit
_arg_ramdisk=$((_arg_ramdisk + 1024 ))M
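A worked example of the sizing logic above (numbers are illustrative only, not produced by the script): for a 200000 MB reference, 200000 + 6400 = 206400, so 115500 is filtered out and 242500 as well as 497500 remain; without a user-supplied --mem the first (smallest) surviving value is requested.

    # illustrative values:
    # _arg_ramdisk=200000                  (MB, from du above)
    # allowed_mem_setting="242500 497500"  (115500 dropped by the filter)
    # _memory_request=242500M              (default: first entry)
    # _arg_ramdisk=201024M                 (after the 1024 MB safety addition above)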
# how many entries are there in the FASTA file?
### how many entries are there in the FASTA file?
nentries=$(grep '>' $FASTA | wc -l)
# we try to set the split number to a value which ensures an output of
# roughly 10,000 split files
### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
if [ -z "$SLURM_JOB_ID" ]; then
export SCRIPT_PATH=$(dirname $0)
submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
submit_statement="sbatch -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable "
if [[ $_arg_debug == "on" ]]; then
script_statement="${script_statement} --debug"
fi
# supply the input files regardless of any user request:
script_statement="${script_statement} $FASTA $DATABASE"
script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE"
if [ -n "$_arg_reservation" ]; then
submit_statement="${submit_statement} --reservation=${_arg_reservation}"
fi
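For illustration only (hypothetical job name, partition and account), a single-node blastn run with the defaults would be submitted roughly as follows, given the 64 cores per node assumed in the -n calculation above and blastn's 8 threads per task:

    # sbatch -o blastrun_%j.out -J blastrun -p nodeshort -A myaccount -t 300 \
    #        -N 1 -n 8 --mem=242500M --ramdisk=201024M -c 8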
......@@ -706,7 +745,7 @@ if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
mkdir -p "$WORKDIR/output" || exit 1;
cd "$WORKDIR"
echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
"${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile " & # splitup queryfile
queue $!
fi
......@@ -716,8 +755,12 @@ while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
sleep 5
done
DATABASE=$(find $RAMDISK -name "*${DBSUFFIX}" -print -quit)
#DATABASE=$RAMDISK/db${DBSUFFIX} #/$(basename $DATABASEPATH)
DATABASE=$(find $RAMDISK -type f -print -quit)
# strip suffixes while more than one dot remains
while [ $(grep -o "\." <<< $DATABASE | wc -l) -gt 1 ]; do
DATABASE=${DATABASE%.*}
done
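The loop above trims extensions until a single dot is left, recovering the database prefix from whichever index file find happened to return first, e.g. (illustrative path):

    # /localscratch/12345/ramdisk/nr.00.pin   (2 dots)
    #   -> /localscratch/12345/ramdisk/nr.00  (1 dot left, loop stops)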
if [[ -z $DATABASE ]]; then
error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
exit 1
......@@ -743,21 +786,14 @@ sbcast $cmdfile $newcmd
rm $cmdfile
cmdfile=$newcmd
echo "command file:"
cat $newcmd
echo
ls /localscratch/$SLURM_JOBID/ramdisk
samples=$(find $(pwd) -type f -name 'group*.fasta')
### append a finishing token to the samples
samples+=('done')
parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS "
srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))"
$parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta')
wait
$parallel "$srun" "$cmdfile" ::: $samples
n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort ) <(cd scratch && find . | grep -o '[0-9]*' |sort )|wc -l)
if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
......@@ -777,38 +813,33 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
# extract footer information and write to outfile
zcat $some_file | tail -n3 >> $outfile
pigz -p 16 $outfile &
ENDC=$(date +%s.%N)
elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
rm -rf $WORKDIR/$SPLITFILEDIR &
#rm ./output/group_*.xml &
#rm -rf ./scratch &
rm ./output/group_*.xml &
wait
ENDC=$(date +%s.%N)
elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
# merge all standard output (outfmt 6, tabular) files
elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
# shrink the allocation, such that only the minimum necessary is accounted for
#scontrol update job=$SLURM_JOB_ID NumNodes=1
pwd
# merge all tabular output files
STARTC=$(date +%s.%N)
outfile="${JOBTAG}.out"
# select the first of all files
some_file=$(find ./output -name 'group*' | head -n1)
# concatenate all compressed split outputs into the output file
for split_file in ./output/group_*gz; do
zcat $split_file >> $outfile
rm $split_file
done
pigz -p 16 $outfile &
ENDC=$(date +%s.%N)
elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
rm -rf $WORKDIR/$SPLITFILEDIR &
rmdir ./output &
wait
ENDC=$(date +%s.%N)
elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
fi
# marks the end of this run
END=$(date +%s.%N)
elapsed=$(bc <<< "scale=1; (($END-$START))/60")
echo "parallel_BLAST took $elapsed minutes to run"
echo "parallel_BLAST took $elapsed minutes to run; the compression took $elapsedc minutes"
# TODO: Check: 1 output item per input scratch file?
# TODO: If not re-submit with correct/adjusted job size
......@@ -4,7 +4,10 @@ function cmdfilewriter()
cat <<EOF > $cmdfile
#!/bin/bash
module purge
#TODO: find a solution for the bug in BLAST+ AND to select the version by hand
module load bio/BLAST+/2.9.0-gompi-2019a
#module load bio/BLAST+/2.7.1-foss-2018a
# are we done?
source ${SCRIPT_PATH}/cleanup.sh
if [ \$1 = "done" ]; then
......@@ -16,15 +19,20 @@ cat <<EOF > $cmdfile
tmp_out=${JOBDIR}/\$outfname
trap "rm -f \$tmp_out" EXIT
START_BLAST=\$(date +%s)
$BLASTEXE -num_threads $SLURM_CPUS_PER_TASK -db $DATABASE $BLASTPARAMS -query \$1 -out \$tmp_out
$_arg_executable -num_threads $SLURM_CPUS_PER_TASK -db $DATABASE $BLASTPARAMS -query \$1 -out \$tmp_out
success=\$?
END_BLAST=\$(date +%s)
elapsed=\$(bc <<< "scale=1; \$((\$END_BLAST - \$START_BLAST))/60")
echo "Elapsed: \$elapsed"
#echo "Elapsed for '\$1': \$elapsed"
# compress, when done
gzip \$tmp_out
# copy back, when ready
mv \${tmp_out}.gz ./output/\${outfname}.gz
# only proceed if the BLAST run succeeded
if [ \$success -eq 0 ]; then
# compress, when done
gzip \$tmp_out
# copy back, when ready
mv \${tmp_out}.gz ./output/\${outfname}.gz
fi
# we only consider the blast exit code for the total exit code
exit \$success
EOF
}
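Each split FASTA file is handed to this generated command file as $1 by the parallel/srun line in the main script, so one task effectively runs something along these lines (illustrative values; the actual file, database and output names depend on the job):

    # blastn -num_threads 8 -db /localscratch/12345/ramdisk/nr -outfmt 6 \
    #        -query group_00001.fasta -out $JOBDIR/<output name>
    # followed by gzip and a move of the compressed result into ./output/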
#!/usr/bin/env python
# dummy line to introduce a line break
import pip
# will take little time, if dependency is already satisfied
pip.main(['install', 'biopython'])
from Bio import SeqIO
import sys
import os
......@@ -22,7 +17,7 @@ batch = list()
for pos, entry in enumerate(record_iter):
if pos == 0:
group += 1
filename = 'group_%5d.fasta' % group
filename = 'group_%05d.fasta' % group
handle = open(os.path.join('scratch', filename), 'w')
if (pos % nlines == 0 and pos != 0):
count = SeqIO.write(batch, handle, 'fasta')
......@@ -30,10 +25,10 @@ for pos, entry in enumerate(record_iter):
handle.close()
batch = list()
group += 1
filename = 'group_%s.fasta' % group
filename = 'group_%05d.fasta' % group
handle = open(os.path.join('scratch', filename), 'w')
batch.append(entry)
# take care of the rest
count = SeqIO.write(batch, handle, 'fasta')
print('Wrote %s records to %s' % (count, filename))
#print('Wrote %s records to %s' % (count, filename))
handle.close()
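The change from %5d and plain %s to %05d in the two filename assignments above zero-pads the group number, so the generated names contain no blanks (unlike space-padded %5d) and sort in numeric order (unlike %s). A quick shell illustration:

    $ printf 'group_%05d.fasta\n' 7 123
    group_00007.fasta
    group_00123.fasta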
......@@ -4,12 +4,9 @@ cat <<EOF > $stagefile
#!/bin/bash
target=/localscratch/$SLURM_JOB_ID/ramdisk
cd \$target
for fname in \$(find ${DATABASEPATH} -type f ); do
#suffix=\${fname#*.}
outfile=\$(basename \${fname})
cp -L \$fname \$outfile
done
parallel -j 4 cp {} {/} ::: \$(find -L ${DATABASEPATH} -type f )
cd -
wait
EOF
}
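The rewritten stage-in relies on GNU parallel's replacement strings: {} expands to the full source path and {/} to its basename, so up to four copies run concurrently into the ramdisk (the working directory at that point). One slot therefore effectively executes, for example:

    # cp /path/to/refdata/nr.00.pin nr.00.pin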
......
......@@ -534,7 +534,7 @@ if [ $_arg_paired -eq 1 ]; then
if [[ ${samples[0]} == *"_R1"* || ${samples[0]} == *"_R2"* ]]; then
first='_R1'
second='_R2'
elif [[ ${samples[0]} == *"_1"* ]]; then
elif [[ ${samples[0]} == *"_1"* ]] || [[ ${samples[0]} == *"_2"* ]]; then
first='_1'
second='_2'
else
......