HPC - JGU - Life Sciences / seq-analysis

Commit 142c65a1 (parent c219169b), authored Aug 29, 2019 by Christian Meesters:
"corrected various minor glitches: sample finding and sanity checks"

Showing 1 changed file, blast/parallel_BLAST/LA_Wrapper, with 43 additions and 32 deletions.
@@ -99,6 +99,14 @@ begins_with_short_option()
 	test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0
 }
+check_nucleotide_db()
+{
+	echo "check not yet implemented"
+}
+check_protein_db()
+{
+	echo "check not yet implemented"
+}
 # function to redirect simple error messages to stderr
 error()
 {
 	(>&2 echo "ERROR: $1")
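check_nucleotide_db and check_protein_db are introduced here as stubs only. As a rough illustration of what such a sanity check could eventually do, the sketch below looks for the index or alias files that makeblastdb produces; the helper name check_db_files and its two-argument interface are invented for this example and are not part of LA_Wrapper.

# hypothetical sketch, not part of the commit: verify that a formatted BLAST
# database exists for a given prefix by checking for its index/alias files
# usage: check_db_files <db_prefix> <nucl|prot>
check_db_files()
{
    local prefix="$1" dbtype="$2" ext
    local -a exts
    if [ "$dbtype" = "nucl" ]; then
        exts=(nin nal)    # single-volume index or multi-volume alias file
    else
        exts=(pin pal)
    fi
    for ext in "${exts[@]}"; do
        [ -e "${prefix}.${ext}" ] && return 0
    done
    return 1
}
# example: check_db_files /path/to/db/nt nucl || echo "no nucleotide DB found" >&2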
@@ -453,6 +461,7 @@ fi
 if [ "blastx" = "${_arg_executable,,}" ]; then
     executable="blastx"
     threads=2
+    DBSUFFIX=".nal" # db suffix to be removed from nal file - not working, if nal file not present
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -461,6 +470,7 @@ if [ "blastx" = "${_arg_executable,,}" ]; then
 elif [ "blastn" = "${_arg_executable,,}" ]; then
     executable="blastn"
     threads=8
+    #check_nucleotide_db
     if [ -z "$SLURM_JOB_ID" ]; then
         source $(dirname "$0")/blast_wrap.sh
     else
@@ -469,6 +479,7 @@ elif [ "blastn" = "${_arg_executable,,}" ]; then
 elif [ "blastp" = "${_arg_executable,,}" ]; then
     executable="blastp"
     threads=2
+    #check_protein_db
     if [ -z "$SLURM_JOB_ID" ]; then
        source $(dirname "$0")/blast_wrap.sh
     else
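All three branches compare the lower-cased user input, ${_arg_executable,,}, against a fixed name, so BLASTX, BlastX and blastx are all accepted. Purely to illustrate that expansion (bash >= 4), the same dispatch can be written as a case statement; the snippet below is a sketch, not a proposed change, and reports errors with a plain echo instead of the wrapper's error function:

# illustration only: case-insensitive dispatch via the ${var,,} expansion
_arg_executable="BlastN"   # example input
case "${_arg_executable,,}" in
    blastx) executable="blastx"; threads=2 ;;
    blastn) executable="blastn"; threads=8 ;;
    blastp) executable="blastp"; threads=2 ;;
    *)      echo "unsupported executable: $_arg_executable" >&2; exit 1 ;;
esac
echo "$executable with $threads threads per task"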
@@ -518,11 +529,14 @@ if [[ "$_arg_blastparams" =~ "outfmt" ]]; then
 else
     BLASTPARAMS="${_arg_blastparams} $DEFAULT_BLASTPARAMS"
 fi

 ### testing for output options - to be used later
 # test whether the output is xml or not
-if [[ '-outfmt 5' =~ "$BLASTPARAMS" ]]; then
+if [[ '-outfmt 5' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
     XMLOUT=1
     OUTOUT=0
-elif [[ '-outfmt 6' =~ "$BLASTPARAMS" ]]; then
+# test whether the output is plain tabular or not
+elif [[ '-outfmt 6' =~ $(echo -e "${BLASTPARAMS}" | sed -e 's/^[[:space:]]*//') ]]; then
     XMLOUT=0
     OUTOUT=1
 fi
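The =~ tests above keep $BLASTPARAMS on the pattern side, i.e. they check whether the (now whitespace-trimmed) parameter string matches inside the literal '-outfmt 5' / '-outfmt 6'; that only works while the parameters contain little besides the -outfmt switch. As a sketch of a more direct alternative, not part of the script, the requested format number could be captured with a bash regex; the variable name outfmt and the example value are made up:

# illustration only: extract the numeric argument of -outfmt, if any
BLASTPARAMS="-evalue 1e-5 -outfmt 6"   # example value
if [[ "$BLASTPARAMS" =~ -outfmt[[:space:]]+([0-9]+) ]]; then
    outfmt="${BASH_REMATCH[1]}"
else
    outfmt=""                  # no explicit -outfmt given
fi
case "$outfmt" in
    5) XMLOUT=1; OUTOUT=0 ;;   # XML output
    6) XMLOUT=0; OUTOUT=1 ;;   # tabular output
    *) XMLOUT=0; OUTOUT=0 ;;
esac
echo "XMLOUT=$XMLOUT OUTOUT=$OUTOUT"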
@@ -568,12 +582,7 @@ if [[ ! $FASTAPATH == /* ]]; then
     FASTAPATH="$PWD/$FASTAPATH";
 fi
-#if [[ ! $DATABASEPATH == /* ]]; then
-#    DATABASEPATH="$PWD/$DATABASEPATH";
-#fi
 FASTA="$FASTAPATH/$FASTAID"
-#DATABASE="$DATABASEPATH/$DATABASEID"

 ### setup blast and splitup executable; check if exist
 allowed_executables="blastx blastp blastn"
@@ -622,12 +631,11 @@ else
     fi
 fi
-# how many entries are there in the FASTA file?
-##
+# how many entries are there in the FASTA file?
 nentries=$(grep '>' $FASTA | wc -l)
 # we try to set the split number to a value, which ensures an output of
 # ~ 10.000 split files

 ### check if this script is on node by checking env-variable $SLURM_JOB_ID, else send it to SLURM with given parameters and exit
 if [ -z "$SLURM_JOB_ID" ]; then
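The -z "$SLURM_JOB_ID" test implements the pattern the comment describes: outside of a job the variable is unset, so the script hands itself to the scheduler (via blast_wrap.sh) and exits; inside the allocation it proceeds. A minimal standalone version of that idiom, with placeholder sbatch options rather than the wrapper's real parameters:

# illustration only: re-submit this script to SLURM when not inside a job
if [ -z "$SLURM_JOB_ID" ]; then
    sbatch --ntasks=4 --cpus-per-task=2 --time=01:00:00 "$0" "$@"
    exit 0
fi
echo "running as SLURM job $SLURM_JOB_ID on $(hostname)"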
@@ -705,8 +713,9 @@ if [ ! -d "$WORKDIR/$SPLITFILEDIR" ]; then
     mkdir -p "$WORKDIR/$SPLITFILEDIR" || exit 1;
     mkdir -p "$WORKDIR/output" || exit 1;
     cd "$WORKDIR"
+    pip install biopython
     echo "executing scratch generator on $FASTA ($_arg_splitup_per_queryfile entries per file)"
-    "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
+    eval "${SCRIPT_PATH}/splitter.py $FASTA $_arg_splitup_per_queryfile" & # splitup queryfile
     queue $!
 fi
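The splitter is started in the background and its PID is handed to queue, so the wrapper can poll until the split-up has finished (the while loop over $QUEUE in the next hunk). The same bookkeeping can be sketched with a plain bash array and wait; long_running_task is a stand-in, not a function from the script:

# illustration only: track background jobs by PID and wait for all of them
pids=()
long_running_task() { sleep "$1"; }    # stand-in for splitter.py etc.
long_running_task 2 & pids+=($!)
long_running_task 3 & pids+=($!)
status=0
for pid in "${pids[@]}"; do
    wait "$pid" || status=1            # propagate failures
done
exit $status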
@@ -716,8 +725,12 @@ while [[ ! -z "$(echo $QUEUE| tr -d ' ')" ]]; do
     sleep 5
 done
-DATABASE=$(find $RAMDISK -name "*${DBSUFFIX}" -print -quit)
 #DATABASE=$RAMDISK/db${DBSUFFIX} #/$(basename $DATABASEPATH)
+DATABASE=$(find $RAMDISK -type f -print -quit)
+# strip suffix - if more than one dot
+while [ $(grep -o "\." <<< $DATABASE | wc -l) -gt 1 ]; do
+    DATABASE=${DATABASE%.*}
+done
 if [[ -z $DATABASE ]]; then
     error "Unable to recognize database, please get in touch with hpc@uni-mainz.de"
     exit 1
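The new while loop reduces whatever file was found on the RAM disk to the database prefix that BLAST expects, by trimming dotted suffixes until at most one dot remains. A standalone illustration of the ${var%.*} trimming, with a made-up file name:

# illustration only
DATABASE="/ramdisk/mydb.fasta.phr"                        # made-up path
while [ $(grep -o "\." <<< "$DATABASE" | wc -l) -gt 1 ]; do
    DATABASE=${DATABASE%.*}                               # drop the last ".suffix"
done
echo "$DATABASE"                                          # -> /ramdisk/mydb.fasta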
@@ -743,23 +756,26 @@ sbcast $cmdfile $newcmd
 rm $cmdfile
 cmdfile=$newcmd
 echo "command file:"
 cat $newcmd
 echo
 ls /localscratch/$SLURM_JOBID/ramdisk
+samples=$(find $(pwd) -type f -name 'group*.fasta')
+### append a finishing token to the samples
+samples+=('done')
 parallel="parallel --no-notice -j $SLURM_NTASKS -P $SLURM_NTASKS"
 srun="srun --cpu-bind=q --mem-bind=q -n 1 -N1 --exclusive -c $SLURM_CPUS_PER_TASK --jobid $SLURM_JOBID --mem-per-cpu=$((SLURM_MEM_PER_NODE / SLURM_CPUS_ON_NODE))"
-$parallel "$srun" "$cmdfile" ::: $(find $(pwd) -type f -name 'group*.fasta')
 set -x
+$parallel "$srun" "$cmdfile" ::: $samples
 echo before wait
 echo $n_unfinished_files $_arg_compress $OUTOUT
 wait
 echo after wait
 echo $n_unfinished_files $_arg_compress $OUTOUT
 set -x
 n_unfinished_files=$(comm -3 <(cd output && find .| grep -o '[0-9]*' |sort) <(cd scratch && find . | grep -o '[0-9]*' |sort)|wc -l)
 echo after setting
 echo $n_unfinished_files $_arg_compress $OUTOUT
 if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -eq 1 ]; then
     # shrink the alloction, such that only the minimum necessary is accounted for
     #scontrol update job=$SLURM_JOB_ID NumNodes=1
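The new samples variable holds the find output as one whitespace-separated string, and ::: $samples relies on word splitting to expand it back into the individual group*.fasta files, with a 'done' finishing token appended per the comment. A sketch of the same hand-off using a bash array built with mapfile; run_blast_chunk.sh is a hypothetical stand-in for the broadcast command file and not part of the wrapper:

# illustration only: collect split files into an array and feed GNU parallel
mapfile -t samples < <(find "$PWD" -type f -name 'group*.fasta')
parallel --no-notice -j "${SLURM_NTASKS:-4}" ./run_blast_chunk.sh {} ::: "${samples[@]}"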
@@ -777,38 +793,33 @@ if [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $XMLOUT -e
     # extract footer information and write to outfile
     zcat $some_file | tail -n3 >> $outfile
     pigz -p 16 $outfile &
-    ENDC=$(date +%s.%N)
-    elapsedc=$(bc <<< "scale=1; (($ENDC - $STARTC))/60")
     rm -rf $WORKDIR/$SPLITFILEDIR &
-    #rm ./output/group_*.xml &
-    #rm -rf ./scratch &
+    rm ./output/group_*.xml &
     wait
     ENDC=$(date +%s.%N)
     elapsedc=$(bc <<< "scale=1; (($ENDC - $STARTC))/60")
 # merge all standard output files (outfmt -6 -- tabular output) files
 elif [ $n_unfinished_files -eq 0 ] && [[ $_arg_compress == "on" ]] && [ $OUTOUT -eq 1 ]; then
     # shrink the alloction, such that only the minimum necessary is accounted for
     #scontrol update job=$SLURM_JOB_ID NumNodes=1
     pwd
     # merge all xml files
     STARTC=$(date +%s.%N)
     outfile="${JOBTAG}.out"
     # select the first of all files
     some_file=$(find ./output -name 'group*' | head -n1)
     # write anything to the output file
     for split_file in ./output/group_*gz; do
         zcat $split_file >> $outfile
         rm $split_file
     done
     pigz -p 16 $outfile &
-    ENDC=$(date +%s.%N)
-    elapsedc=$(bc <<< "scale=1; (($ENDC - $STARTC))/60")
     rm -rf $WORKDIR/$SPLITFILEDIR &
     rmdir ./output &
     wait
     ENDC=$(date +%s.%N)
     elapsedc=$(bc <<< "scale=1; (($ENDC - $STARTC))/60")
 fi
 # marks the end of this run
 END=$(date +%s.%N)
 elapsed=$(bc <<< "scale=1; (($END - $START))/60")
-echo "parallel_BLAST took $elapsed minutes to run"
+echo "parallel_BLAST took $elapsed minutes to run; the compression took $elapsedc minutes"
 # TODO: Check: 1 output item per input scratch file?
 # TODO: If not re-submit with correct/adjusted job size
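Run times are taken as date +%s.%N differences and converted with bc; because of the /60 the reported figures are minutes, with one decimal place due to scale=1. A self-contained version of the measurement, for illustration:

# illustration only
START=$(date +%s.%N)
sleep 2                                        # stand-in for the real work
END=$(date +%s.%N)
elapsed=$(bc <<< "scale=1; ($END - $START)/60")
echo "took $elapsed minutes"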