......@@ -41,9 +41,15 @@ START=$(date +%s.%N)
module purge
# load the most current version of GNU parallel
module load tools/parallel
module load tools/parallel/20190822
#module load lang/Python/3.6.4-foss-2018a
module load lang/Python/3.7.4-GCCcore-8.3.0
#TODO: find a solution for the bug in BLAST+ and a way to select the version by hand
module load bio/BLAST+/2.9.0-gompi-2019a
#module load bio/BLAST+/2.7.1-foss-2018a
module load lang/Python/3.6.4-foss-2018a
### setup variable for THIS script; giving absolute path if necessary
SCRIPT="$0"
......@@ -121,7 +127,7 @@ declare _arg_ramdisk=40G
_arg_blastdir='.'
_arg_executable='blastx'
_arg_test=off
_arg_compress=off
_arg_compress=on
print_help ()
{
......@@ -444,7 +450,6 @@ if [[ $_arg_test == "off" ]] && [ ! -d "$DATABASE" ]; then
exit 1
fi
#TODO: differentiate between blastn, blastx, and blastp -- for now, all are treated equally
if [ "blastx" = "${_arg_executable,,}" ]; then
executable="blastx"
threads=2
......@@ -545,7 +550,7 @@ if [ -z "$SLURM_JOB_ID" ] && [[ $_arg_test == "off" ]]; then
echo "removing directory $JOBTAG"
rm -r $JOBTAG
else
echo "So you want to continue regardless? (e.g. scratch files already existing) ([y]/n)"
echo "So, you want to continue regardless (using the existing scratch files)? ([y]/n)"
echo -n '>'
read ENTER
if [[ ${ENTER,,} = 'n' || ${ENTER,,} == 'no' ]] ; then
......@@ -572,16 +577,13 @@ FASTA="$FASTAPATH/$FASTAID"
### setup blast and splitup executable; check if exist
allowed_executables="blastx blastp blastn"
if [[ ! $allowed_executables =~ (^|[[:space:]])"$_arg_executable"($|[[:space:]]) ]]; then
error "$_arg_executable ought to be one of [$allowed_executables]"
exit 1
fi
BLASTEXE=$(which "$_arg_executable")
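# note (illustrative, not part of the script): the =~ test above is a
# whole-word whitelist check; 'blastx' matches, a bare 'blast' does not:
#   [[ "blastx blastp blastn" =~ (^|[[:space:]])blastx($|[[:space:]]) ]]  # true
#   [[ "blastx blastp blastn" =~ (^|[[:space:]])blast($|[[:space:]]) ]]   # false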
if [ -z "$BLASTEXE" ] && [[ $_arg_test == "off" ]]; then
error "'$_arg_executable': not found, please load an appropriate module before restarting."
exit 1
fi
export _arg_executable
### determine the size of the reference directory; it dictates the ramdisk size
_arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M
......@@ -670,40 +672,30 @@ RAMDISK=$JOBDIR/ramdisk
HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',' ' ')
QUEUE=''
#myhost=$(hostname -f)
stagefile=/localscratch/$SLURM_JOB_ID/dummy_stagein.sh
rstagefile=/localscratch/$SLURM_JOB_ID/stagein.sh
source "${SCRIPT_PATH}"/stage_in.sh
stage_in_writer
chmod +x $stagefile
# distribute the stagewriter
sbcast $stagefile $rstagefile
rm $stagefile
stagefile=$rstagefile
# with regard to slurm alone we would not need this loop, but as we already run
# asynchronous tasks, we keep track of them with the queue (see the sketch after
# this loop)
for HOST in $HOSTLIST; do
if [ -L ${DATABASEPATH} ]; then
warning "If the reference directory is a link, fast stage-in is not possible."
for fname in ${DATABASEPATH}/*; do
eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
done
else
for fname in ${DATABASEPATH}/*; do
if [ -L "$fname" ]; then
eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
else
eval "ssh $HOST dd bs=4096 if=$fname of=${RAMDISK}/$(basename $fname)" &
PID=$!
queue $PID
fi
done
fi
# TODO: check for dereferencing links, before enabling
# TODO: check for performance, before re-enabling
#sbcast $FILE $RAMDISK/$(basename $FILE)
srun -w $HOST -N1 -n1 -c1 --mem-per-cpu=5000M $stagefile &
queue $!
done
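# hedged sketch: 'queue' is defined elsewhere in this repository; a minimal
# implementation consistent with the polling loop further below (QUEUE holds
# the PIDs of unfinished background tasks) could look like this -- the helper
# name 'refresh_queue' is assumed for illustration:
#   queue() { QUEUE="$QUEUE $1"; }
#   refresh_queue() {   # drop PIDs whose processes have finished
#       local pid alive=''
#       for pid in $QUEUE; do kill -0 "$pid" 2>/dev/null && alive="$alive $pid"; done
#       QUEUE=$alive
#   }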
#DATABASE=$RAMDISK/$DATABASE
DATABASE=$RAMDISK #/$(basename $DATABASEPATH)
WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME
# this script may never output to a user's $HOME
if [[ *"$WORKDIR"* = 'home' ]]; then
eror "Cowardly refusing to operate in a home directory."
error "Cowardly refusing to operate in a home directory."
fi
# set path names to ease maintenance
SPLITFILEDIR=scratch
......@@ -714,9 +706,8 @@ if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
mkdir -p "$WORKDIR/output" || exit 1;
cd "$WORKDIR"
echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
PID=$!
queue $PID
"${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
queue $!
fi
# wait until the copy and a possible scratch generation are finished
......@@ -725,6 +716,13 @@ while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
sleep 5
done
DATABASE=$(find $RAMDISK -name "*${DBSUFFIX}" -print -quit)
#DATABASE=$RAMDISK/db${DBSUFFIX} #/$(basename $DATABASEPATH)
if [[ -z $DATABASE ]]; then
error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
exit 1
fi
cd "$WORKDIR"
# calculating the degree of parallelism is necessary in order not to oversaturate the allocation with srun processes (see the sketch below)
......@@ -733,15 +731,6 @@ if [[ -z "$SLURM_CPUS_PER_TASK" ]]; then
declare -i SLURM_CPUS_PER_TASK=1
fi
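# sketch (assumption, not verbatim from this script): the degree of
# parallelism can be bounded by the allocated task slots, e.g.:
#   declare -i tasks_per_node=$(( SLURM_CPUS_ON_NODE / SLURM_CPUS_PER_TASK ))
#   declare -i parallelism=$(( SLURM_JOB_NUM_NODES * tasks_per_node ))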
if [[ -z $DATABASE ]]; then
error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
fi
# see whether we find a file in the db
tmp=$(find $DATABASE -type f -print -quit)
# remove the 2nd suffix
DATABASE=${tmp%.*}
### a temporary script to conduct the alignment
cmdfile=/localscratch/$SLURM_JOB_ID/dummy.sh
cmdfilewriter
......@@ -754,6 +743,12 @@ sbcast $cmdfile $newcmd
rm $cmdfile
cmdfile=$newcmd
echo "command file:"
cat $newcmd
echo
ls /localscratch/$SLURM_JOBID/ramdisk
### append a finishing token to the samples
samples+=('done')
......@@ -764,7 +759,6 @@ $parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta')
wait
set -x
# count splits in scratch/ that lack a counterpart in output/, comparing numeric IDs
n_unfinished_files=$(comm -3 <(cd output && find . | grep -o '[0-9]*' | sort) <(cd scratch && find . | grep -o '[0-9]*' | sort) | wc -l)
if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
# shrink the allocation, such that only the minimum necessary is accounted for
......@@ -801,6 +795,7 @@ elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT
# concatenate all compressed split files into the output file
for split_file in ./output/group_*gz; do
zcat $split_file >> $outfile
rm $split_file
done
pigz -p 16 $outfile &
ENDC=$(date +%s.%N)
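# sketch: ENDC pairs with the START stamp taken at the top of the script; the
# total wall time in minutes could be reported with bc, e.g.:
#   echo "total runtime: $(bc <<< "scale=1; ($ENDC - $START)/60") min"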
......
......@@ -4,7 +4,10 @@ function cmdfilewriter()
cat <<EOF > $cmdfile
#!/bin/bash
module purge
#TODO: find a solution for the bug in BLAST+ and a way to select the version by hand
module load bio/BLAST+/2.9.0-gompi-2019a
#module load bio/BLAST+/2.7.1-foss-2018a
# are we done?
source ${SCRIPT_PATH}/cleanup.sh
if [ \$1 = "done" ]; then
......@@ -16,15 +19,22 @@ cat <<EOF > $cmdfile
tmp_out=${JOBDIR}/\$outfname
trap "rm -f \$tmp_out" EXIT
START_BLAST=\$(date +%s)
$BLASTEXE -num_threads $SLURM_CPUS_PER_TASK -db $DATABASE $BLASTPARAMS -query \$1 -out \$tmp_out
set -x
$_arg_executable -num_threads $SLURM_CPUS_PER_TASK -db $DATABASE $BLASTPARAMS -query \$1 -out \$tmp_out
success=\$?
set +x
END_BLAST=\$(date +%s)
# convert seconds to minutes with one decimal place
elapsed=\$(bc <<< "scale=1; \$((\$END_BLAST - \$START_BLAST))/60")
echo "Elapsed: \$elapsed"
# compress, when done
gzip \$tmp_out
# copy back, when ready
mv \${tmp_out}.gz ./output/\${outfname}.gz
# only copy results back if blast succeeded
if [ \$success -eq 0 ]; then
# compress, when done
gzip \$tmp_out
# copy back, when ready
mv \${tmp_out}.gz ./output/\${outfname}.gz
fi
# we only consider the blast exit code for the total exit code
exit \$success
EOF
}
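# usage sketch (taken from the main script above): generate the command file,
# broadcast it to the local scratch of every node, then let GNU parallel fan
# the query chunks out over srun:
#   cmdfilewriter
#   sbcast $cmdfile $newcmd
#   $parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta')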
function stage_in_writer() {
cat <<EOF > $stagefile
#!/bin/bash
target=/localscratch/$SLURM_JOB_ID/ramdisk
cd \$target
for fname in \$(find ${DATABASEPATH} -type f ); do
#suffix=\${fname#*.}
outfile=\$(basename \${fname})
cp -L \$fname \$outfile
done
cd -
EOF
}
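# usage sketch (as in the main script above): write the stage-in script,
# broadcast it to all nodes, then execute one copy per node:
#   stage_in_writer
#   chmod +x $stagefile
#   sbcast $stagefile $rstagefile
#   srun -w $HOST -N1 -n1 -c1 --mem-per-cpu=5000M $rstagefile &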
......@@ -18,7 +18,7 @@
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
# ARE DISCLAIMED. IN NO EVENT SHALL CHRISTIAN MEESTERS OR THE JGU MAINZ BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
......@@ -30,7 +30,7 @@
module purge
# load the most current version of GNU parallel
module load tools/parallel/20181122
module load tools/parallel
# make the return value of the last pipe command which fails the return value
set -o pipefail
......
......@@ -31,6 +31,7 @@ cat <<EOF > $cmdfile
shrink_job
exit
fi
module purge
module load bio/STAR # most current
# STAR just needs an outfile prefix - hooray
......
......@@ -25,6 +25,7 @@ cat <<EOF > $cmdfile
shrink_job
exit
fi
module purge
module load bio/Bowtie2/2.3.4.3-foss-2018a
module load bio/SAMtools/1.9
#reference=${reference%%.*} # only provide the prefix + path!
......
......@@ -17,6 +17,7 @@ cmdfilewriter()
{
cat <<EOF > $cmdfile
#!/bin/bash
module purge
module load bio/BWA/0.7.17
module load bio/SAMtools
# source the error reporter script
......
......@@ -24,6 +24,7 @@ cmdfilewriter()
{
cat <<EOF > $cmdfile
#!/bin/bash
module purge
module load bio/SeqAn
# source the error reporter script
source ${SCRIPT_PATH}/errors.sh
......
......@@ -534,7 +534,7 @@ if [ $_arg_paired -eq 1 ]; then
if [[ ${samples[0]} == *"_R1"* || ${samples[0]} == *"_R2"* ]]; then
first='_R1'
second='_R2'
elif [[ ${samples[0]} == *"_1"* ]]; then
elif [[ ${samples[0]} == *"_1"* ]] || [[ ${samples[0]} == *"_2"* ]]; then
first='_1'
second='_2'
else
......
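# illustrative example (hypothetical filenames): for sample_1.fastq and
# sample_2.fastq the _R1/_R2 branch does not match, the elif detects the
# _1/_2 scheme, and the mate file can be derived by suffix substitution:
#   mate=${samples[0]/$first/$second}   # sample_1.fastq -> sample_2.fastq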