Commits (7)
@@ -19,7 +19,8 @@ unpack_sources = False
 files_to_copy = ['LA_Wrapper',
 'cleanup.sh',
 'blast_wrap.sh',
-'splitter.py']
+'splitter.py',
+'stage_in.sh']
 postinstallcmds = ['mv %(installdir)s/parallel_BLAST/* %(installdir)s && rmdir %(installdir)s/parallel_BLAST']
......
@@ -53,7 +53,7 @@ module load bio/BLAST+/2.9.0-gompi-2019a
 ### setup variable for THIS script; giving absolute path if necessary
 SCRIPT="$0"
-SCRIPT_VERSION="0.5"
+SCRIPT_VERSION="0.5.1"
 # TODO: delete the following 3 functions, once sbcast is working
 function queue {
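
The queue helpers are truncated in this view; judging from their use further down (`queue $!` after backgrounding a job, then polling `$QUEUE` in a sleep loop), they track background PIDs. A minimal sketch consistent with that usage, purely an assumption since the real bodies are not shown:

QUEUE=""
function queue {
    QUEUE="$QUEUE $1"              # remember the PID of a backgrounded job
}
function checkqueue {              # hypothetical companion: drop PIDs that have exited
    local running=""
    for pid in $QUEUE; do
        kill -0 "$pid" 2>/dev/null && running="$running $pid"
    done
    QUEUE=$running
}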
@@ -99,6 +99,14 @@ begins_with_short_option()
 test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
 }
+check_nucleotide_db() {
+echo "check not yet implemented"
+}
+check_protein_db() {
+echo "check not yet implemented"
+}
 # function to redirect simple error messages to stderr
 error() {
 (>&2 echo "ERROR: $1")
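
The two new `check_*_db` functions are stubs (and are only called in commented-out form below). A conceivable implementation, sketched here as an assumption and not part of the commit, would probe for the standard BLAST index suffixes, using the `error` helper defined in this script:

check_nucleotide_db() {
    # a nucleotide DB ships .nin index or .nal alias files (assumes $1 is the DB basename)
    [ -e "$1.nin" ] || [ -e "$1.nal" ] || { error "'$1' does not look like a nucleotide database"; return 1; }
}
check_protein_db() {
    # a protein DB ships .pin index or .pal alias files
    [ -e "$1.pin" ] || [ -e "$1.pal" ] || { error "'$1' does not look like a protein database"; return 1; }
}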
@@ -217,6 +225,7 @@ credits()
 echo " - faster stage-in for reference data"
 echo " - automerge for -outfmt=6"
 echo " - -outfmt=6 is now the default"
+echo "- v0.5.1 -- 29. Aug. 2019 -- numerous fixes"
 echo
 echo "Current version is: $SCRIPT_VERSION"
 echo
@@ -453,6 +462,7 @@ fi
 if [ "blastx" = "${_arg_executable,,}" ]; then
 executable="blastx"
 threads=2
+DBSUFFIX=".nal" # db suffix to strip from the .nal alias file - does not work if no .nal file is present
 if [ -z "$SLURM_JOB_ID" ]; then
 source $(dirname "$0")/blast_wrap.sh
 else
@@ -461,6 +471,7 @@ if [ "blastx" = "${_arg_executable,,}" ]; then
 elif [ "blastn" = "${_arg_executable,,}" ]; then
 executable="blastn"
 threads=8
+#check_nucleotide_db
 if [ -z "$SLURM_JOB_ID" ]; then
 source $(dirname "$0")/blast_wrap.sh
 else
@@ -469,6 +480,7 @@ elif [ "blastn" = "${_arg_executable,,}" ]; then
 elif [ "blastp" = "${_arg_executable,,}" ]; then
 executable="blastp"
 threads=2
+#check_protein_db
 if [ -z "$SLURM_JOB_ID" ]; then
 source $(dirname "$0")/blast_wrap.sh
 else
@@ -518,11 +530,14 @@ if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
 else
 BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS"
 fi
+### testing for output options - to be used later
 # test whether the output is xml or not
-if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then
+if [[ '-outfmt 5' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
 XMLOUT=1
 OUTOUT=0
-elif [[ '-outfmt 6' =~ "$BLASTPARAMS" ]]; then
+# test whether the output is plain tabular or not
+elif [[ '-outfmt 6' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
 XMLOUT=0
 OUTOUT=1
 fi
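
Note the operand order in these tests: in bash's `[[ string =~ pattern ]]`, the right-hand side is the pattern, so the stripped `$BLASTPARAMS` is matched against the literal string `-outfmt 5`, not the other way round. The test therefore only fires when the parameters amount to a substring of `-outfmt 5`; an illustration of the asymmetry:

BLASTPARAMS="-outfmt 5"
[[ '-outfmt 5' =~ $BLASTPARAMS ]] && echo match        # matches
BLASTPARAMS="-evalue 1e-5 -outfmt 5"
[[ '-outfmt 5' =~ $BLASTPARAMS ]] || echo "no match"   # any extra parameter defeats the test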
@@ -568,12 +583,7 @@ if [[ ! $FASTAPATH == /* ]]; then
 FASTAPATH="$PWD/$FASTAPATH";
 fi
-#if [[ ! $DATABASEPATH == /* ]]; then
-# DATABASEPATH="$PWD/$DATABASEPATH";
-#fi
 FASTA="$FASTAPATH/$FASTAID"
-#DATABASE="$DATABASEPATH/$DATABASEID"
 ### setup blast and splitup executable; check if exist
 allowed_executables="blastx blastp blastn"
@@ -622,12 +632,11 @@ else
 fi
 fi
-# how many entries are there in the FASTA file?
+### how many entries are there in the FASTA file?
 nentries=$(grep '>' $FASTA | wc -l)
-# we try to set the split number to a value, which ensures an output of
-# ~ 10.000 split files
 ### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
 if [ -z "$SLURM_JOB_ID" ]; then
@@ -705,8 +714,9 @@ if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
 mkdir -p "$WORKDIR/$SPLITFILEDIR" || exit 1;
 mkdir -p "$WORKDIR/output" || exit 1;
 cd "$WORKDIR"
+pip install biopython
 echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
-"${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
+eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile " & # splitup queryfile
 queue $!
 fi
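
The switch to `eval` fixes a quoting bug: quoting the whole command string makes bash look for a single executable whose name is the entire string, arguments included. Roughly (hypothetical values):

cmd="./splitter.py query.fasta 100"
"$cmd" &      # fails: no such file or directory './splitter.py query.fasta 100'
eval "$cmd" & # the string is re-parsed into a command plus its arguments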
@@ -716,8 +726,12 @@ while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
 sleep 5
 done
-DATABASE=$(find $RAMDISK -name "*${DBSUFFIX}" -print -quit)
-#DATABASE=$RAMDISK/db${DBSUFFIX} #/$(basename $DATABASEPATH)
+DATABASE=$(find $RAMDISK -type f -print -quit)
+# strip suffix - if more than one dot
+while [ $(grep -o "\." <<< $DATABASE | wc -l) -gt 1 ]; do
+DATABASE=${DATABASE%.*}
+done
 if [[ -z $DATABASE ]]; then
 error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
 exit 1
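
The loop derives the database basename BLAST expects from whichever staged file `find` returns first, by removing extensions until only one dot is left in the whole path. For a hypothetical staged index file:

DATABASE=/localscratch/42/ramdisk/mydb.fasta.nin   # two dots in the path
DATABASE=${DATABASE%.*}   # -> /localscratch/42/ramdisk/mydb.fasta; one dot left, loop ends

Note that the dot count runs over the full path, so a dot in a directory name would stop the stripping too early.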
@@ -743,23 +757,24 @@ sbcast $cmdfile $newcmd
 rm $cmdfile
 cmdfile=$newcmd
+echo "command file:"
+cat $newcmd
+echo
+ls /localscratch/$SLURM_JOBID/ramdisk
+samples=$(find $(pwd) -type f -name 'group*.fasta')
+### append a finishing token to the samples
+samples+=('done')
 parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS "
 srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))"
-$parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta')
+$parallel "$srun" "$cmdfile" ::: $samples
+echo before wait
+echo $n_unfinished_files $_arg_compress $OUTOUT
 wait
+echo after wait
+echo $n_unfinished_files $_arg_compress $OUTOUT
 n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort ) <(cd scratch && find . | grep -o '[0-9]*' |sort )|wc -l)
+echo after setting
+echo $n_unfinished_files $_arg_compress $OUTOUT
 if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
 # shrink the allocation, such that only the minimum necessary is accounted for
 #scontrol update job=$SLURM_JOB_ID NumNodes=1
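
The `comm -3` bookkeeping above counts split files without a matching result: every file name in `output/` and `scratch/` is reduced to its digits, and `comm -3` prints the numbers unique to either side. A worked toy case with hypothetical file sets:

# output/ so far holds results 00001 and 00003; scratch/ holds splits 00001-00003:
comm -3 <(printf '00001\n00003\n') <(printf '00001\n00002\n00003\n') | wc -l
# -> 1, i.e. split 00002 has not produced a result yet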
@@ -777,38 +792,33 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
 # extract footer information and write to outfile
 zcat $some_file | tail -n3 >> $outfile
 pigz -p 16 $outfile &
-ENDC=$(date +%s.%N)
-elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
 rm -rf $WORKDIR/$SPLITFILEDIR &
-#rm ./output/group_*.xml &
-#rm -rf ./scratch &
+rm ./output/group_*.xml &
 wait
+ENDC=$(date +%s.%N)
+elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
+# merge all standard output (outfmt 6 -- tabular) files
 elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
 # shrink the allocation, such that only the minimum necessary is accounted for
 #scontrol update job=$SLURM_JOB_ID NumNodes=1
 pwd
-# merge all xml files
 STARTC=$(date +%s.%N)
 outfile="${JOBTAG}.out"
 # select the first of all files
 some_file=$(find ./output -name 'group*' | head -n1)
 # write anything to the output file
 for split_file in ./output/group_*gz; do
 zcat $split_file >> $outfile
 rm $split_file
 done
 pigz -p 16 $outfile &
-ENDC=$(date +%s.%N)
-elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
 rm -rf $WORKDIR/$SPLITFILEDIR &
+rmdir ./output &
 wait
+ENDC=$(date +%s.%N)
+elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
 fi
 # marks the end of this run
 END=$(date +%s.%N)
 elapsed=$(bc <<< "scale=1; (($END-$START))/60")
-echo "parallel_BLAST took $elapsed minutes to run"
+echo "parallel_BLAST took $elapsed minutes to run; the compression took $elapsedc minutes"
 # TODO: Check: 1 output item per input scratch file?
 # TODO: If not re-submit with correct/adjusted job size
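
Moving the `ENDC`/`elapsedc` measurement behind `wait` in both branches makes the reported compression time honest: `pigz` is launched in the background, so measuring right after launching it only timed the fork. Schematically:

pigz -p 16 $outfile &    # compression keeps running in the background
wait                     # block until pigz and the cleanup jobs finish
ENDC=$(date +%s.%N)      # only now does STARTC..ENDC cover the whole compression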
@@ -19,13 +19,11 @@ cat <<EOF > $cmdfile
 tmp_out=${JOBDIR}/\$outfname
 trap "rm -f \$tmp_out" EXIT
 START_BLAST=\$(date +%s)
-set -x
 $_arg_executable -num_threads $SLURM_CPUS_PER_TASK -db $DATABASE $BLASTPARAMS -query \$1 -out \$tmp_out
-set +x
 success=\$?
 END_BLAST=\$(date +%s)
 elapsed=\$(bc <<< "scale=1; \$((\$END_BLAST - \$START_BLAST))/60")
-echo "Elapsed: \$elapsed"
+#echo "Elapsed for '\$1': \$elapsed"
 # only proceed, when ready
 if [ \$success -eq 0 ]; then
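
Dropping the `set -x`/`set +x` pair is more than noise reduction: `success=\$?` used to capture the exit status of `set +x` (always 0), never of the BLAST call. In schematic form:

blastn ... ; set +x ; success=$?   # old: success is the status of 'set +x', always 0
blastn ... ; success=$?            # new: success reflects the BLAST run itself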
......
 #!/usr/bin/env python
-# dummy line to introduce a line break
-import pip
-# will take little time, if dependency is already satisfied
-pip.main(['install', 'biopython'])
 from Bio import SeqIO
 import sys
 import os
@@ -22,7 +17,7 @@ batch = list()
 for pos, entry in enumerate(record_iter):
     if pos == 0:
         group += 1
-        filename = 'group_%5d.fasta' % group
+        filename = 'group_%05d.fasta' % group
         handle = open(os.path.join('scratch', filename), 'w')
     if (pos % nlines == 0 and pos != 0):
         count = SeqIO.write(batch, handle, 'fasta')
@@ -30,7 +25,7 @@ for pos, entry in enumerate(record_iter):
         handle.close()
         batch = list()
         group += 1
-        filename = 'group_%s.fasta' % group
+        filename = 'group_%05d.fasta' % group
         handle = open(os.path.join('scratch', filename), 'w')
     batch.append(entry)
 # take care of the rest
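
The `%05d` fix matters twice over: `%5d` pads with spaces, producing file names with embedded blanks that break the wrapper's shell globs, and the `%s` branch yielded unpadded numbers that do not sort in numeric order. Reproduced with printf:

printf 'group_%5d.fasta\n' 1              # old: 'group_    1.fasta' (spaces in the name)
printf '%s\n' group_1.fasta group_2.fasta group_10.fasta | sort
# -> group_1.fasta, group_10.fasta, group_2.fasta   (wrong order without zero-padding)
printf 'group_%05d.fasta\n' 1             # new: 'group_00001.fasta', sorts numerically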
......
@@ -7,9 +7,10 @@ cd \$target
 for fname in \$(find ${DATABASEPATH} -type f ); do
 #suffix=\${fname#*.}
 outfile=\$(basename \${fname})
-cp -L \$fname \$outfile
+cp -L \$fname \$outfile &
 done
 cd -
+wait
 EOF
 }
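
Backgrounding each `cp` turns the stage-in into a parallel copy of all database volumes, and the new `wait` before `EOF` keeps the generated script from exiting while copies are still in flight. Schematically, with hypothetical volume names:

cp -L nr.00.psq nr.00.psq &   # all volumes copy to the ramdisk concurrently
cp -L nr.01.psq nr.01.psq &
wait                          # without this, BLAST could start on half-staged files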
......