diff --git a/blast/parallel_BLAST/LA_Wrapper b/blast/parallel_BLAST/LA_Wrapper
index bec96fe052b0b9a14ca9d0c76909ed1a1e3cd325..199b3017ad0d1e4324b6398604d5c37df189c046 100755
--- a/blast/parallel_BLAST/LA_Wrapper
+++ b/blast/parallel_BLAST/LA_Wrapper
@@ -99,6 +99,14 @@ begins_with_short_option()
     test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
 }
 
+check_nucleotide_db() {
+    echo "check not yet implemented"
+}
+
+check_protein_db() {
+    echo "check not yet implemented"
+}
+
 # function to redirect simple error messages to stderr
 error() {
     (>&2 echo "ERROR: $1")
@@ -453,6 +461,7 @@ fi
 if [ "blastx" = "${_arg_executable,,}" ]; then
     executable="blastx"
     threads=2
+    DBSUFFIX=".nal" # db suffix to be removed from nal file - not working, if nal file not present
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -461,6 +470,7 @@ if [ "blastx" = "${_arg_executable,,}" ]; then
 elif [ "blastn" = "${_arg_executable,,}" ]; then
     executable="blastn"
     threads=8
+    #check_nucleotide_db
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -469,6 +479,7 @@ elif [ "blastn" = "${_arg_executable,,}" ]; then
 elif [ "blastp" = "${_arg_executable,,}" ]; then
     executable="blastp"
     threads=2
+    #check_protein_db
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -518,11 +529,14 @@ if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
 else
     BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS"
 fi
+
+### testing for output options - to be used later
 # test whether the output is xml or not
-if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then
+if [[ '-outfmt 5' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
     XMLOUT=1
     OUTOUT=0
-elif [[ '-outfmt 6' =~ "$BLASTPARAMS" ]]; then
+# test whether the output is plain tabular or not
+elif [[ '-outfmt 6' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
     XMLOUT=0
     OUTOUT=1
 fi
@@ -568,12 +582,7 @@ if [[ ! $FASTAPATH == /* ]]; then
     FASTAPATH="$PWD/$FASTAPATH";
 fi
 
-#if [[ ! $DATABASEPATH == /* ]]; then
-#    DATABASEPATH="$PWD/$DATABASEPATH";
-#fi
-
 FASTA="$FASTAPATH/$FASTAID"
-#DATABASE="$DATABASEPATH/$DATABASEID"
 
 ### setup blast and splitup executable; check if exist
 allowed_executables="blastx blastp blastn"
@@ -622,12 +631,11 @@ else
     fi
 fi
 
-# how many entries are there in the FASTA file?
+### how many entries are there in the FASTA file?
 nentries=$(grep '>' $FASTA | wc -l)
 
 # we try to set the split number to a value, which ensures an output of
 # ~ 10.000 split files
-
 
 ### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
 if [ -z "$SLURM_JOB_ID" ]; then
@@ -705,8 +713,9 @@ if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
     mkdir -p "$WORKDIR/$SPLITFILEDIR" || exit 1;
     mkdir -p "$WORKDIR/output" || exit 1;
     cd "$WORKDIR"
+    pip install biopython
     echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
-    "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
+    eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile " & # splitup queryfile
     queue $!
 fi
 
@@ -716,8 +725,12 @@ while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
     sleep 5
 done
 
-DATABASE=$(find $RAMDISK -name "*${DBSUFFIX}" -print -quit)
-#DATABASE=$RAMDISK/db${DBSUFFIX} #/$(basename $DATABASEPATH)
+DATABASE=$(find $RAMDISK -type f -print -quit)
+# strip suffix - if more than one dot
+while [ $(grep -o "\." <<< $DATABASE | wc -l) -gt 1 ]; do
+    DATABASE=${DATABASE%.*}
+done
+
 if [[ -z $DATABASE ]]; then
     error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
     exit 1
@@ -743,23 +756,26 @@ sbcast $cmdfile $newcmd
 rm $cmdfile
 cmdfile=$newcmd
 
-echo "command file:"
-cat $newcmd
-echo
-ls /localscratch/$SLURM_JOBID/ramdisk
-
-
+samples=$(find $(pwd) -type f -name 'group*.fasta')
 ### append a finishing token to the samples
 samples+=('done')
 
 parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS "
 srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))"
 
-$parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta')
+set -x
+$parallel "$srun" "$cmdfile" ::: $samples
+echo before wait
+echo $n_unfinished_files $_arg_compress $OUTOUT
 wait
+echo after wait
+echo $n_unfinished_files $_arg_compress $OUTOUT
+set -x
 
 n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort ) <(cd scratch && find . | grep -o '[0-9]*' |sort )|wc -l)
+echo after setting
+echo $n_unfinished_files $_arg_compress $OUTOUT
 
 if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
     # shrink the alloction, such that only the minimum necessary is accounted for
     #scontrol update job=$SLURM_JOB_ID NumNodes=1
@@ -777,38 +793,33 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
     # extract footer information and write to outfile
     zcat $some_file | tail -n3 >> $outfile
     pigz -p 16 $outfile &
-    ENDC=$(date +%s.%N)
-    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
     rm -rf $WORKDIR/$SPLITFILEDIR &
-    #rm ./output/group_*.xml &
-    #rm -rf ./scratch &
+    rm ./output/group_*.xml &
     wait
+    ENDC=$(date +%s.%N)
+    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
+# merge all standard output files (outfmt -6 -- tabular output) files
 elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
-    # shrink the alloction, such that only the minimum necessary is accounted for
-    #scontrol update job=$SLURM_JOB_ID NumNodes=1
-    pwd
-    # merge all xml files
     STARTC=$(date +%s.%N)
     outfile="${JOBTAG}.out"
-    # select the first of all files
-    some_file=$(find ./output -name 'group*' | head -n1)
     # write anything to the output file
     for split_file in ./output/group_*gz; do
         zcat $split_file >> $outfile
         rm $split_file
     done
     pigz -p 16 $outfile &
-    ENDC=$(date +%s.%N)
-    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
     rm -rf $WORKDIR/$SPLITFILEDIR &
+    rmdir ./output &
     wait
+    ENDC=$(date +%s.%N)
+    elapsedc=$(bc <<< "scale=1; (($ENDC-$STARTC))/60")
 fi
 
 
 # marks the end of this run
 END=$(date +%s.%N)
 elapsed=$(bc <<< "scale=1; (($END-$START))/60")
-echo "parallel_BLAST took $elapsed minutes to run"
+echo "parallel_BLAST took $elapsed minutes to run; the compression took $elapsedc minutes"
 
 # TODO: Check: 1 output item per input scratch file?
 # TODO: If not re-submit with correct/adjusted job size
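Note on the new check_nucleotide_db()/check_protein_db() stubs: they currently only print "check not yet implemented" and the calls in the blastn/blastp branches stay commented out. A minimal sketch of what such a check could look like, assuming the database basename is handed in as an argument and that the usual BLAST+ index extensions exist on disk (.nin or a .nal alias for nucleotide databases, .pin or a .pal alias for protein databases). The function names mirror the stubs; the db_prefix argument and the example path are illustrative only, not part of the patch:

```bash
#!/usr/bin/env bash
# Sketch only -- not part of the patch.
check_nucleotide_db() {
    local db_prefix="$1"    # hypothetical argument: database basename, e.g. /path/to/nt
    # a nucleotide database provides a .nin index or a .nal alias file
    if [ -e "${db_prefix}.nin" ] || [ -e "${db_prefix}.nal" ]; then
        return 0
    fi
    (>&2 echo "ERROR: no nucleotide BLAST database found for '${db_prefix}'")
    return 1
}

check_protein_db() {
    local db_prefix="$1"
    # a protein database provides a .pin index or a .pal alias file
    if [ -e "${db_prefix}.pin" ] || [ -e "${db_prefix}.pal" ]; then
        return 0
    fi
    (>&2 echo "ERROR: no protein BLAST database found for '${db_prefix}'")
    return 1
}

# usage example (hypothetical path)
check_nucleotide_db "/scratch/db/nt" || exit 1
```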
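Note on the reworked -outfmt detection: the patch keeps the original operand order of `=~`, i.e. the literal `-outfmt 5` is the string being matched and the whitespace-trimmed `$BLASTPARAMS` is used as the pattern, so the test only succeeds when the parameter string is itself (a sub-pattern of) `-outfmt 5`. If the intent is "does `$BLASTPARAMS` contain `-outfmt 5`?", the more common direction is sketched below; the `BLASTPARAMS` value is a hypothetical example and this is an alternative, not the logic the patch adopts:

```bash
#!/usr/bin/env bash
# Sketch of an alternative output-format check (assumption, not part of the patch):
# look for the -outfmt option inside BLASTPARAMS instead of the other way round.
BLASTPARAMS="-evalue 1e-5 -outfmt 5"    # hypothetical example value

XMLOUT=0; OUTOUT=0
if [[ "$BLASTPARAMS" =~ -outfmt[[:space:]]+5 ]]; then
    XMLOUT=1    # XML output (-outfmt 5) requested
elif [[ "$BLASTPARAMS" =~ -outfmt[[:space:]]+6 ]]; then
    OUTOUT=1    # plain tabular output (-outfmt 6) requested
fi
echo "XMLOUT=$XMLOUT OUTOUT=$OUTOUT"    # -> XMLOUT=1 OUTOUT=0
```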
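Note on the new database detection: the basename is now derived from the first file that `find` returns on the ramdisk by stripping trailing extensions until at most one dot remains. A stand-alone illustration of that loop with a hypothetical multi-volume database file; note that a name with only a single extension (e.g. `db.pin`) is left untouched, since just one dot is present:

```bash
#!/usr/bin/env bash
# Illustration of the suffix-stripping loop from the patch; the input path is hypothetical.
DATABASE="/localscratch/12345/ramdisk/nt.00.nhr"

# remove the trailing extension while the name still contains more than one dot
while [ "$(grep -o '\.' <<< "$DATABASE" | wc -l)" -gt 1 ]; do
    DATABASE=${DATABASE%.*}
done

echo "$DATABASE"    # -> /localscratch/12345/ramdisk/nt.00
```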