Commit 1d6458be authored by Christian Meesters

Merge branch 'devel' into 'master'

Devel

Closes #30

See merge request !18
parents 5509f2df cd18464c
@@ -12,14 +12,16 @@ easyblock = 'Binary'
 sources = ['./parallel_BLAST/LA_Wrapper',
            './parallel_BLAST/cleanup.sh',
            './parallel_BLAST/blast_wrap.sh',
-           './parallel_BLAST/splitter.py']
+           './parallel_BLAST/splitter.py',
+           './parallel_BLAST/stage_in.sh']
 unpack_sources = False
 files_to_copy = ['LA_Wrapper',
                  'cleanup.sh',
                  'blast_wrap.sh',
-                 'splitter.py']
+                 'splitter.py',
+                 'stage_in.sh']
 postinstallcmds = ['mv %(installdir)s/parallel_BLAST/* %(installdir)s && rmdir %(installdir)s/parallel_BLAST']
@@ -43,8 +43,7 @@ module purge
 # load the most current version of GNU parallel
 module load tools/parallel/20190822
-#module load lang/Python/3.6.4-foss-2018a
-module load lang/Python/3.7.4-GCCcore-8.3.0
+module load bio/Biopython/1.74-foss-2019a
 #TODO: find a solution for the bug in BLAST+ AND to select the version by hand
 module load bio/BLAST+/2.9.0-gompi-2019a
 #module load bio/BLAST+/2.7.1-foss-2018a
@@ -53,7 +52,7 @@ module load bio/BLAST+/2.9.0-gompi-2019a
 ### setup variable for THIS script; giving absolute path if necessary
 SCRIPT="$0"
-SCRIPT_VERSION="0.5"
+SCRIPT_VERSION="0.5.3"
 # TODO: delete the following 3 functions, once sbcast is working
 function queue {
@@ -99,6 +98,14 @@ begins_with_short_option()
 test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
 }
+check_nucleotide_db() {
+    echo "check not yet implemented"
+}
+check_protein_db() {
+    echo "check not yet implemented"
+}
 # function to redirect simple error messages to stderr
 error() {
     (>&2 echo "ERROR: $1")
@@ -128,11 +135,12 @@ _arg_blastdir='.'
 _arg_executable='blastx'
 _arg_test=off
 _arg_compress=on
+_arg_debug=off
 print_help ()
 {
     echo "This script's help msg"
-    printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-s|--splitup <arg>] [-N|--nodes <arg>] [--executable <arg>] [-m|--mem <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
+    printf 'Usage: %s [-l|--runlimit <arg>] [-p|--partition <arg>] [-N|--nodes <arg>] [--executable <arg>] [--blastparams <string>] [-r|--ramdisk <arg>] [--blastdir <arg>] [--(no-)test] [-h|--help] <FASTA> <DATABASE>\n' "$(basename $0)\n"
     printf 'HINT: The FASTA and DATABASE items need to be full paths to files.\n'
     printf "\\t\\033[1m%s\\033[0m\\t\\t%s\\n" "<FASTA>" "path to the query FASTA file"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "<DATABASE>" "path to the database file"
@@ -142,11 +150,8 @@ print_help ()
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-N,--nodes" "number of nodes (1 is the default)"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--reservation" "reservation to use (none is the default)"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--time" "time in minutes (300 is the default)"
-    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-m,--mem" "memory which is required per node (defaults to 115500 M, but should be min. 242500 M for blastn, omit the unit for submitting)"
-    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-r,--ramdisk" "ramdisk size in units of GiB (default is 40 GiB)"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-t,--threads" "blast threads (default is 1)"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastparams" "blast parameters (default is -outfmt 6 (for plain tabulated output))"
-    printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "-s,--splitup" "No. of FASTA sequences per query file (default is to generate ~5000 files)"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--blastdir" "output directory (default is composition of input names)"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--executable" "choose executable (currently only from NCBI-BLAST, default: blastx)"
     printf "\\t\\033[1m%s\\033[0m\\t%s\\n" "--compress" "if set, the output files will be merged and compressed (time consuming!, default: off)"
@@ -217,6 +222,11 @@ credits()
     echo " - faster stage-in for reference data"
     echo " - automerge for -outfmt=6"
     echo " - -outfmt=6 is now the default"
+    echo "- v0.5.1 -- 29. Aug. 2019 -- numerous housekeeping fixes"
+    echo "- v0.5.2 -- 02. Sep. 2019 -- fix:"
+    echo "  - consistent biopython inclusion"
+    echo "  - auto-detection of database size and memory selection"
+    echo "- v0.5.3 -- 24. Sep. 2019 -- fix: dereferencing of reference / database files re-enabled"
     echo
     echo "Current version is: $SCRIPT_VERSION"
     echo
@@ -406,6 +416,10 @@ do
             _arg_test="on"
             test "${1:0:5}" = "--no-" && _arg_test="off"
             ;;
+        --debug)
+            _arg_debug="on"
+            set -x
+            ;;
         --credits|--version)
             credits
             exit 0
@@ -437,7 +451,7 @@ done
 ### get query and database files
 FASTA=$_arg_fasta
-DATABASE=$_arg_database
+DATABASE=$(realpath $_arg_database)
 ### check if query & database exist
 if [[ $_arg_test == "off" ]] && [ ! -e "$FASTA" ]; then
@@ -453,6 +467,7 @@ fi
 if [ "blastx" = "${_arg_executable,,}" ]; then
     executable="blastx"
     threads=2
+    DBSUFFIX=".nal" # db suffix to be removed from nal file - not working, if nal file not present
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -461,6 +476,7 @@ if [ "blastx" = "${_arg_executable,,}" ]; then
 elif [ "blastn" = "${_arg_executable,,}" ]; then
     executable="blastn"
     threads=8
+    #check_nucleotide_db
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -469,6 +485,7 @@ elif [ "blastn" = "${_arg_executable,,}" ]; then
 elif [ "blastp" = "${_arg_executable,,}" ]; then
     executable="blastp"
     threads=2
+    #check_protein_db
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -518,11 +535,14 @@ if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
 else
     BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS"
 fi
+### testing for output options - to be used later
 # test whether the output is xml or not
-if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then
+if [[ '-outfmt 5' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
     XMLOUT=1
     OUTOUT=0
-elif [[ '-outfmt 6' =~ "$BLASTPARAMS" ]]; then
+# test whether the output is plain tabular or not
+elif [[ '-outfmt 6' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
     XMLOUT=0
     OUTOUT=1
 fi
@@ -568,12 +588,7 @@ if [[ ! $FASTAPATH == /* ]]; then
     FASTAPATH="$PWD/$FASTAPATH";
 fi
-#if [[ ! $DATABASEPATH == /* ]]; then
-#    DATABASEPATH="$PWD/$DATABASEPATH";
-#fi
 FASTA="$FASTAPATH/$FASTAID"
-#DATABASE="$DATABASEPATH/$DATABASEID"
 ### setup blast and splitup executable; check if exist
 allowed_executables="blastx blastp blastn"
@@ -586,7 +601,7 @@ fi
 export _arg_executable
 ### which is the reference directory size?
-_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M
+_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )
 if [[ ! $SCRIPT == /* ]]; then
     SCRIPT="$PWD/$SCRIPT";
 fi
@@ -594,20 +609,37 @@ fi
 # which cluster are we on?
 cluster=$(sacctmgr show cluster -p| tail -n1| cut -f1 -d '|')
 # if the cluster is Mogon I, set the memory default accordingly:
+allowed_mem_setting=""
 if [ "$cluster" == "mogon" ]; then
-    if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
-        allowed_mem_setting="115500 242500 497500"
-        if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then
-            error "Memory selection out to be one of [$allowed_mem_settings]"
-            exit 1
-        fi
-    else # set a default memory
-        if [ "$_arg_executable" == "blastn" ]; then
-            _memory_request="242500M"
-        else
-            _memory_request="115500M"
-        fi
-    fi
+    allowed_mem_settings="115500 242500 497500"
+    # add a safety measure (100 MB per core)
+    for setting in $allowed_mem_settings; do
+        if [ $((_arg_ramdisk + 6400 )) -lt $setting ]; then
+            allowed_mem_setting="$allowed_mem_setting $setting"
+        fi
+    done
+    # test whether there is any valid setting left
+    if [ -z "$allowed_mem_setting" ]; then
+        error "database > available + necessary RAM"
+        exit 1
+    fi
+    # remove first space, if any
+    allowed_mem_setting="${allowed_mem_setting/ /}"
+    if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
+        if [[ ! "$allowed_mem_setting" =~ (^|[[:space:]])"$_arg_mem"($|[[:space:]]) ]]; then
+            error "Memory selection ought to be one of [$allowed_mem_setting]"
+            exit 1
+        fi
+    # select a default
+    else
+        _memory_request=$(echo $allowed_mem_setting | cut -d" " -f1)M
+        #if [ "$_arg_executable" == "blastn" ]; then
+        #    _memory_request="242500M"
+        #else
+        #    _memory_request="115500M"
+        #fi
+    fi
-else
+else # to be implemented for MII
     if [ $_arg_mem -ne 0 ]; then # user tries to select a non-default memory
         allowed_mem_setting="115500 242500 497500"
         if [[ ! $allowed_mem_settings =~ (^|[[:space:]])"_arg_mem"($|[[:space:]]) ]]; then
@@ -621,20 +653,27 @@ else
         fi
     fi
 fi
+# finally add another 1024 MB to the ramdisk to be safe and append the unit
+_arg_ramdisk=$((_arg_ramdisk + 1024 ))M
-# how many entries are there in the FASTA file?
+### how many entries are there in the FASTA file?
 nentries=$(grep '>' $FASTA | wc -l)
 # we try to set the split number to a value, which ensures an output of
 # ~ 10.000 split files
 ### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
 if [ -z "$SLURM_JOB_ID" ]; then
     export SCRIPT_PATH=$(dirname $0)
-    submit_statement="sbatch --no-requeue -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
-    script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable $FASTA $DATABASE"
+    submit_statement="sbatch -o ${JOBTAG}_%j.out -J $JOBTAG -p $_arg_queue -A $_arg_assoc -t $_arg_runlimit -N $_arg_nodes -n $((64 * $_arg_nodes / $threads)) --mem=$_memory_request --ramdisk=${_arg_ramdisk} -c $threads"
+    script_statement="$SCRIPT --partition $_arg_queue --account $_arg_assoc --nodes $_arg_nodes --time $_arg_runlimit --reservation=$_arg_reservation --threads $_arg_blast_threads --splitup $_arg_splitup_per_queryfile --blastparams=\"$BLASTPARAMS\" --executable=$_arg_executable "
+    if [[ $_arg_debug == "on" ]]; then
+        script_statement="${script_statement} --debug"
+    fi
+    # supply the input files regardless of any user request:
+    script_statement="${script_statement} $FASTA $DATABASE"
     if [ -n "$_arg_reservation" ]; then
         submit_statement="${submit_statement} --reservation=${_arg_reservation}"
     fi
@@ -706,7 +745,7 @@ if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
     mkdir -p "$WORKDIR/output" || exit 1;
     cd "$WORKDIR"
     echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
-    "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
+    eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile " & # splitup queryfile
     queue $!
 fi
@@ -716,8 +755,12 @@ while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
     sleep 5
 done
-DATABASE=$(find $RAMDISK -name "*${DBSUFFIX}" -print -quit)
-#DATABASE=$RAMDISK/db${DBSUFFIX} #/$(basename $DATABASEPATH)
+DATABASE=$(find $RAMDISK -type f -print -quit)
+# strip suffix - if more than one dot
+while [ $(grep -o "\." <<< $DATABASE | wc -l) -gt 1 ]; do
+    DATABASE=${DATABASE%.*}
+done
 if [[ -z $DATABASE ]]; then
     error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
     exit 1
@@ -743,21 +786,14 @@ sbcast $cmdfile $newcmd
 rm $cmdfile
 cmdfile=$newcmd
-echo "command file:"
-cat $newcmd
-echo
-ls /localscratch/$SLURM_JOBID/ramdisk
+samples=$(find $(pwd) -type f -name 'group*.fasta')
 ### append a finishing token to the samples
 samples+=('done')
 parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS "
 srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))"
-$parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta')
-wait
+$parallel "$srun" "$cmdfile" ::: $samples
 n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort ) <(cd scratch && find . | grep -o '[0-9]*' |sort )|wc -l)
 if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
@@ -777,38 +813,33 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
     # extract footer information and write to outfile
     zcat $some_file | tail -n3 >> $outfile
     pigz -p 16 $outfile &
-    ENDC=$(date +%s.%N)
-    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
     rm -rf $WORKDIR/$SPLITFILEDIR &
-    #rm ./output/group_*.xml &
-    #rm -rf ./scratch &
+    rm ./output/group_*.xml &
     wait
+    ENDC=$(date +%s.%N)
+    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
+# merge all standard output files (outfmt -6 -- tabular output) files
 elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
-    # shrink the alloction, such that only the minimum necessary is accounted for
-    #scontrol update job=$SLURM_JOB_ID NumNodes=1
-    pwd
-    # merge all xml files
     STARTC=$(date +%s.%N)
     outfile="${JOBTAG}.out"
-    # select the first of all files
-    some_file=$(find ./output -name 'group*' | head -n1)
     # write anything to the output file
     for split_file in ./output/group_*gz; do
         zcat $split_file >> $outfile
         rm $split_file
     done
     pigz -p 16 $outfile &
-    ENDC=$(date +%s.%N)
-    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
     rm -rf $WORKDIR/$SPLITFILEDIR &
+    rmdir ./output &
     wait
+    ENDC=$(date +%s.%N)
+    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
 fi
 # marks the end of this run
 END=$(date +%s.%N)
 elapsed=$(bc <<< "scale=1; (($END-$START))/60")
-echo "parallel_BLAST took $elapsed minutes to run"
+echo "parallel_BLAST took $elapsed minutes to run; the compression took $elapsedc minutes"
 # TODO: Check: 1 output item per input scratch file?
 # TODO: If not re-submit with correct/adjusted job size
@@ -4,7 +4,10 @@ function cmdfilewriter()
 cat <<EOF > $cmdfile
 #!/bin/bash
 module purge
+#TODO: find a solution for the bug in BLAST+ AND to select the version by hand
 module load bio/BLAST+/2.9.0-gompi-2019a
+#module load bio/BLAST+/2.7.1-foss-2018a
 # are we done?
 source ${SCRIPT_PATH}/cleanup.sh
 if [ \$1 = "done" ]; then
@@ -16,15 +19,20 @@ cat <<EOF > $cmdfile
 tmp_out=${JOBDIR}/\$outfname
 trap "rm -f \$tmp_out" EXIT
 START_BLAST=\$(date +%s)
-$BLASTEXE -num_threads $SLURM_CPUS_PER_TASK -db $DATABASE $BLASTPARAMS -query \$1 -out \$tmp_out
+$_arg_executable -num_threads $SLURM_CPUS_PER_TASK -db $DATABASE $BLASTPARAMS -query \$1 -out \$tmp_out
+success=\$?
 END_BLAST=\$(date +%s)
 elapsed=\$(bc <<< "scale=1; \$((\$END_BLAST - \$START_BLAST))/60")
-echo "Elapsed: \$elapsed"
-# compress, when done
-gzip \$tmp_out
-# copy back, when ready
-mv \${tmp_out}.gz ./output/\${outfname}.gz
+#echo "Elapsed for '\$1': \$elapsed"
+# only proceed, when ready
+if [ \$success -eq 0 ]; then
+    # compress, when done
+    gzip \$tmp_out
+    # copy back, when ready
+    mv \${tmp_out}.gz ./output/\${outfname}.gz
+fi
+# we only consider the blast exit code for the total exit code
+exit \$success
 EOF
 }
 #!/usr/bin/env python
-# dummy line to introduce a line break
-import pip
-# will take little time, if dependency is already satisfied
-pip.main(['install', 'biopython'])
 from Bio import SeqIO
 import sys
 import os
@@ -22,7 +17,7 @@ batch = list()
 for pos, entry in enumerate(record_iter):
     if pos == 0:
         group += 1
-        filename = 'group_%5d.fasta' % group
+        filename = 'group_%05d.fasta' % group
         handle = open(os.path.join('scratch', filename), 'w')
     if (pos % nlines == 0 and pos != 0):
         count = SeqIO.write(batch, handle, 'fasta')
@@ -30,10 +25,10 @@ for pos, entry in enumerate(record_iter):
         handle.close()
         batch = list()
         group += 1
-        filename = 'group_%s.fasta' % group
+        filename = 'group_%05d.fasta' % group
         handle = open(os.path.join('scratch', filename), 'w')
     batch.append(entry)
 # take care of the rest
 count = SeqIO.write(batch, handle, 'fasta')
-print('Wrote %s records to %s' % (count, filename))
+#print('Wrote %s records to %s' % (count, filename))
 handle.close()
@@ -4,12 +4,9 @@ cat <<EOF > $stagefile
 #!/bin/bash
 target=/localscratch/$SLURM_JOB_ID/ramdisk
 cd \$target
-for fname in \$(find ${DATABASEPATH} -type f ); do
-    #suffix=\${fname#*.}
-    outfile=\$(basename \${fname})
-    cp -L \$fname \$outfile
-done
+parallel -j 4 cp {} {/} ::: \$(find -L ${DATABASEPATH} -type f )
 cd -
+wait
 EOF
 }
@@ -534,7 +534,7 @@ if [ $_arg_paired -eq 1 ]; then
     if [[ ${samples[0]} == *"_R1"* || ${samples[0]} == *"_R2"* ]]; then
         first='_R1'
         second='_R2'
-    elif [[ ${samples[0]} == *"_1"* ]]; then
+    elif [[ ${samples[0]} == *"_1"* ]] || [[ ${samples[0]} == *"_2"* ]]; then
        first='_1'
        second='_2'
     else