diff --git a/blast/parallel_BLAST/LA_Wrapper b/blast/parallel_BLAST/LA_Wrapper
index 31c4e9ea163b4b40b4fad9a24e03a8f5d8eee135..12520c8509cdf690aa175181f299341054617f7a 100755
--- a/blast/parallel_BLAST/LA_Wrapper
+++ b/blast/parallel_BLAST/LA_Wrapper
@@ -43,7 +43,13 @@ module purge
 # load the most current version of GNU parallel
 module load tools/parallel
-module load lang/Python/3.6.4-foss-2018a
+#module load lang/Python/3.6.4-foss-2018a
+module load lang/Python/3.7.4-GCCcore-8.3.0
+# TODO: find a workaround for the bug in BLAST+ and allow selecting the version by hand
+module load bio/BLAST+/2.9.0-gompi-2019a
+#module load bio/BLAST+/2.7.1-foss-2018a
+
+
 ### setup variable for THIS script; giving absolute path if necessary
 SCRIPT="$0"
@@ -433,6 +439,16 @@ done
 FASTA=$_arg_fasta
 DATABASE=$_arg_database
 
+### check that the database directory provides a '.fa' or '.fasta' file
+if compgen -G "${DATABASE}*.fa" > /dev/null; then
+    DBSUFFIX=".fa"
+elif compgen -G "${DATABASE}*.fasta" > /dev/null; then
+    DBSUFFIX=".fasta"
+else
+    error "no '.fa' or '.fasta' file found in ${DATABASE} - unable to proceed reliably"
+    exit 1
+fi
+
 ### check if query & database exist
 if [[ $_arg_test == "off" ]] && [ ! -e "$FASTA" ]; then
     error "FASTA input was: '$FASTA' - no such file!"
@@ -545,7 +561,7 @@ if [ -z "$SLURM_JOB_ID" ] && [[ $_arg_test == "off" ]]; then
         echo "removing directory $JOBTAG"
         rm -r $JOBTAG
     else
-        echo "So you want to continue regardless? (e.g. scratch files already existing) ([y]/n)"
+        echo "So, you want to continue regardless (using the existing scratch files)? ([y]/n)"
         echo -n '>'
         read ENTER
         if [[ ${ENTER,,} = 'n' || ${ENTER,,} == 'no' ]] ; then
@@ -572,11 +588,13 @@ FASTA="$FASTAPATH/$FASTAID"
 
 ### setup blast and splitup executable; check if exist
 allowed_executables="blastx blastp blastn"
-if [[ ! $allowed_executables =~ (^|[[:space:]])"$_arg_executable"($|[[:space:]]) ]]; then
-# BLASTEXE=$(which $_arg_executable)
-#else
+if [[ $allowed_executables =~ (^|[[:space:]])"$_arg_executable"($|[[:space:]]) ]]; then
+    BLASTEXE=$(which $_arg_executable)
+else
     error "$_arg_executable ought to be one of [$allowed_executables]"
+    exit 1
 fi
+export _arg_executable
 
 ### which is the reference directory size?
 _arg_ramdisk=$(du -shL --block-size=1M "$_arg_database" | cut -f1 )M
@@ -665,40 +683,30 @@ RAMDISK=$JOBDIR/ramdisk
 HOSTLIST=$(scontrol show hostname $SLURM_JOB_NODELIST | paste -d, -s | tr ',', ' ')
 QUEUE=''
 
+#myhost=$(hostname -f)
+stagefile=/localscratch/$SLURM_JOB_ID/dummy_stagein.sh
+rstagefile=/localscratch/$SLURM_JOB_ID/stagein.sh
+source "${SCRIPT_PATH}"/stage_in.sh
+stage_in_writer
+chmod +x $stagefile
+
+# distribute the generated stage-in script to all nodes
+sbcast $stagefile $rstagefile
+
+rm $stagefile
+stagefile=$rstagefile
+# SLURM alone would not require this loop, but since we already run
+# asynchronous tasks, we keep tracking them with the queue
 for HOST in $HOSTLIST; do
-    if [ -L ${DATABASEPATH} ]; then
-        warning "If the reference directory is a link, fast stage-in is not possible."
-        for fname in ${DATABASEPATH}/*; do
-            eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
-            PID=$!
-            queue $PID
-        done
-    else
-        for fname in ${DATABASEPATH}/*; do
-            if [ -L "$fname" ]; then
-                eval "ssh $HOST cp -L $fname ${RAMDISK}/$(basename $fname)" &
-                PID=$!
-                queue $PID
-            else
-                eval "ssh $HOST dd bs=4096 if=$fname of=${RAMDISK}/$(basename $fname)" &
-                PID=$!
-                queue $PID
-            fi
-        done
-    fi
-    # TODO: check for dereferencing links, before enabling
-    # TODO: check for performance, before re-enabling
-    #sbcast $FILE $RAMDISK/$(basename $FILE)
+    srun -w $HOST -N1 -n1 -c1 --mem-per-cpu=5000M $stagefile &
+    queue $!
 done
 
-#DATABASE=$RAMDISK/$DATABASE
-DATABASE=$RAMDISK #/$(basename $DATABASEPATH)
-
 WORKDIR=$PWD/$BLASTDIR/$SLURM_JOB_NAME
 # this script may never output to a user's $HOME
 if [[ *"$WORKDIR"* = 'home' ]]; then
-    eror "Cowardly refusing to operate in a home directory."
+    error "Cowardly refusing to operate in a home directory."
 fi
 # set path names to ease maintance
 SPLITFILEDIR=scratch
@@ -709,9 +717,8 @@ if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
     mkdir -p "$WORKDIR/output" || exit 1;
     cd "$WORKDIR"
     echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
-    eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
-    PID=$!
-    queue $PID
+    "${SCRIPT_PATH}/splitter.py" "$FASTA" "$_arg_splitup_per_queryfile" & # split up the query file
+    queue $!
 fi
 
 # wait until the copy and a possible scratch generation are finished
@@ -720,6 +727,16 @@ while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
     sleep 5
 done
 
+set -x
+ls $RAMDISK/*
+DATABASE=$(find $RAMDISK -name "*${DBSUFFIX}" -print -quit)
+set +x
+#DATABASE=$RAMDISK/db${DBSUFFIX} #/$(basename $DATABASEPATH)
+if [[ -z $DATABASE ]]; then
+    error "unable to find a '*${DBSUFFIX}' database file in the ramdisk, please get in touch with hpc@uni-mainz.de"
+    exit 1
+fi
+
 cd "$WORKDIR"
 
 # calculating the degree of parallelism is necessary in order not to oversaturate with srun processes.
@@ -728,15 +745,6 @@ if [[ -z "$SLURM_CPUS_PER_TASK" ]]; then
     declare -i SLURM_CPUS_PER_TASK=1
 fi
 
-if [[ -z $DATABASE ]]; then
-    error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
-fi
-
-# see whether we find a file in the db
-tmp=$(find $DATABASE -type f -print -quit)
-# remove the 2nd suffix
-DATABASE=${tmp%.*}
-
 ### a temporary script to conduct the alignment
 cmdfile=/localscratch/$SLURM_JOB_ID/dummy.sh
 cmdfilewriter
@@ -749,6 +757,12 @@ sbcast $cmdfile $newcmd
 
 rm $cmdfile
 cmdfile=$newcmd
 
+echo "command file:"
+cat $newcmd
+echo
+ls /localscratch/$SLURM_JOB_ID/ramdisk
+
+
 ### append a finishing token to the samples
 samples+=('done')
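
Reviewer note (not part of the patch): the stage-in rework above relies entirely on the QUEUE bookkeeping - one background srun per host, each PID remembered via `queue $!`, and a polling loop that waits until the queue drains. The following is a minimal, self-contained bash sketch of that pattern for review purposes only; the helper bodies (`queue`, `checkqueue`) and the host names are illustrative assumptions, not code taken from the repository:

    #!/usr/bin/env bash
    # Illustrative sketch of the background-task/queue pattern used by the wrapper.
    QUEUE=""

    queue() {            # remember the PID of a background task
        QUEUE="$QUEUE $1"
    }

    checkqueue() {       # drop PIDs whose tasks have finished
        local running=""
        for pid in $QUEUE; do
            kill -0 "$pid" 2>/dev/null && running="$running $pid"
        done
        QUEUE="$running"
    }

    # one asynchronous stage-in task per host (sleep stands in for srun/ssh)
    for HOST in node01 node02 node03; do
        sleep $((RANDOM % 3 + 1)) &
        queue $!
    done

    # wait until every tracked task is gone
    while [[ -n "$(echo $QUEUE | tr -d ' ')" ]]; do
        checkqueue
        sleep 1
    done
    echo "all stage-in tasks finished"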