Created
June 7, 2023 19:28
-
-
Save aofarrel/fa1fd24cdfafb387e53ba177fa05317b to your computer and use it in GitHub Desktop.
Pulled from the folder of a run of a decontamination task (the first one listed here: https://github.com/aofarrel/clockwork-wdl/blob/main/tasks/combined_decontamination.wdl) which ran on Terra. The file is known as "script" with no extension, but has a bash shebang.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
cd /cromwell_root | |
tmpDir=$(mkdir -p "/cromwell_root/tmp.65940862" && echo "/cromwell_root/tmp.65940862") | |
chmod 777 "$tmpDir" | |
export _JAVA_OPTIONS=-Djava.io.tmpdir="$tmpDir" | |
export TMPDIR="$tmpDir" | |
export HOME="$HOME" | |
( | |
cd /cromwell_root | |
) | |
out91ebffb0="${tmpDir}/out.$$" err91ebffb0="${tmpDir}/err.$$" | |
mkfifo "$out91ebffb0" "$err91ebffb0" | |
trap 'rm "$out91ebffb0" "$err91ebffb0"' EXIT | |
touch '/cromwell_root/stdout' '/cromwell_root/stderr' | |
tee '/cromwell_root/stdout' < "$out91ebffb0" & | |
tee '/cromwell_root/stderr' < "$err91ebffb0" >&2 & | |
( | |
cd /cromwell_root | |
READS_FILES_UNSORTED=("/cromwell_root/topmed_workflow_testing/tb/sra/ERR7057565_1.fastq" "/cromwell_root/topmed_workflow_testing/tb/sra/ERR7057565_2.fastq") | |
# make sure reads are paired correctly | |
# | |
# clockwork map_reads seems to require each pair of fqs are consecutive, such as: | |
# (SRR1_1.fq, SRR1_2.fq, SRR2_1.fq, SRR2_2.fq) | |
# If you instead had (SRR1_1.fq, SRR2_1.fq, SRR1_2.fq, SRR2_2.fq) then fqcount would | |
# fail assuming SRR1 and SRR2 have different read counts. Interestingly, this was | |
# never an issue when downloading reads via SRANWRP, but to better support direct | |
# input of reads, this sort of hack is necessary. | |
readarray -t READS_FILES < <(for fq in "${READS_FILES_UNSORTED[@]}"; do echo "$fq"; done | sort) | |
# downsample, if necessary | |
# | |
# Downsampling relies on deleting inputs and then putting a new file where the the old | |
# input was. This works on Terra, but there is a chance this gets iffy elsewhere. | |
# If you're having issues with miniwdl, --copy-input-files might help | |
if [[ "450" != "-1" ]] | |
then | |
for inputfq in "${READS_FILES[@]}" | |
do | |
size_inputfq=$(du -m "$inputfq" | cut -f1) | |
# shellcheck disable=SC2004 | |
# just trust me on this one | |
if (( $size_inputfq > 450 )) | |
then | |
seqtk sample -s1965 "$inputfq" 1000000 > temp.fq | |
rm "$inputfq" | |
mv temp.fq "$inputfq" | |
echo "WARNING: downsampled $inputfq (was $size_inputfq MB)" | |
fi | |
done | |
fi | |
# Terra-Cromwell does not place you in the home dir, but rather one folder down, so we have | |
# to go up one to get the ref genome. miniwdl goes further. Who knows what other executors do. | |
# The tar command will however place the untarred directory in the workdir. | |
tar -xvf /ref/Ref.remove_contam.tar | |
# anticipate bad fastqs | |
# | |
# This is a hack to make sure the check_this_fastq task output is defined iff this | |
# WDL task fails. The duplicate will be deleted if we decontam successfully. We use | |
# copies of the inputs WDL gets iffy when trying to glob on optionals, and because | |
# deleting inputs is wonky on some backends (understandably!) | |
for inputfq in "${READS_FILES[@]}" | |
do | |
cp "$inputfq" "ERR7057565_1.fastq_dcntmfail.fastq" | |
done | |
# map reads for decontamination | |
timeout -v 0m clockwork map_reads \ | |
--unsorted_sam \ | |
\ | |
ERR7057565 \ | |
Ref.remove_contam/ref.fa \ | |
ERR7057565.sam \ | |
"${READS_FILES[@]}" | |
exit=$? | |
if [[ $exit = 124 ]] | |
then | |
echo "ERROR -- clockwork map_reads timed out" | |
if [[ "false" = "true" ]] | |
then | |
set -eux -o pipefail | |
exit 1 | |
else | |
exit 0 | |
fi | |
elif [[ $exit = 137 ]] | |
then | |
echo "ERROR -- clockwork map_reads was killed -- it may have run out of memory" | |
if [[ "false" = "true" ]] | |
then | |
set -eux -o pipefail | |
exit 1 | |
else | |
exit 0 | |
fi | |
elif [[ $exit = 0 ]] | |
then | |
echo "Reads successfully mapped to decontamination reference" | |
elif [[ $exit = 1 ]] | |
then | |
echo "ERROR -- clockwork map_reads errored out for unknown reasons" | |
set -eux -o pipefail | |
exit 1 | |
else | |
echo "ERROR -- clockwork map_reads returned $exit for unknown reasons" | |
set -eux -o pipefail | |
exit 1 | |
fi | |
echo "************ removing contamination *****************" | |
# calculate the last three positional arguments of the rm_contam task | |
if [[ ! "" = "" ]] | |
then | |
arg_counts_out="" | |
else | |
arg_counts_out="ERR7057565.decontam.counts.tsv" | |
fi | |
arg_reads_out1="ERR7057565_1.decontam.fq.gz" | |
arg_reads_out2="ERR7057565_2.decontam.fq.gz" | |
# TODO: this doesn't seem to be in the nextflow version of this pipeline, but it seems | |
# we need it in the WDL version? | |
# https://github.com/iqbal-lab-org/clockwork/issues/77 | |
# https://github.com/iqbal-lab-org/clockwork/blob/v0.11.3/python/clockwork/contam_remover.py#L170 | |
# | |
# This might intereact with unsorted_sam, which seems to actually be a dupe remover | |
# https://github.com/iqbal-lab-org/clockwork/blob/v0.11.3/python/clockwork/tasks/map_reads.py#L18 | |
# https://github.com/iqbal-lab-org/clockwork/blob/v0.11.3/python/clockwork/read_map.py#L26 | |
samtools sort -n ERR7057565.sam > sorted_by_read_name_ERR7057565.sam | |
# One of remove_contam's tasks will throw a warning about index files. Ignore it. | |
# https://github.com/mhammell-laboratory/TEtranscripts/issues/99 | |
timeout -v 0m clockwork remove_contam \ | |
Ref.remove_contam/remove_contam_metadata.tsv \ | |
sorted_by_read_name_ERR7057565.sam \ | |
$arg_counts_out \ | |
$arg_reads_out1 \ | |
$arg_reads_out2 \ | |
\ | |
\ | |
exit=$? | |
if [[ $exit = 124 ]] | |
then | |
echo "ERROR -- clockwork remove_contam timed out" | |
if [[ "false" = "true" ]] | |
then | |
set -eux -o pipefail | |
exit 1 | |
else | |
exit 0 | |
fi | |
elif [[ $exit = 137 ]] | |
then | |
echo "ERROR -- clockwork remove_contam was killed -- it may have run out of memory" | |
if [[ "false" = "true" ]] | |
then | |
set -eux -o pipefail | |
exit 1 | |
else | |
exit 0 | |
fi | |
elif [[ $exit = 0 ]] | |
then | |
echo "Reads successfully decontaminated" | |
elif [[ $exit = 1 ]] | |
then | |
echo "ERROR -- clockwork remove_contam errored out for unknown reasons" | |
set -eux -o pipefail | |
exit 1 | |
else | |
echo "ERROR -- clockwork remove_contam returned $exit for unknown reasons" | |
set -eux -o pipefail | |
exit 1 | |
fi | |
# We passed, so delete the output that would signal to run fastqc | |
rm "ERR7057565_1.fastq_dcntmfail.fastq" | |
echo "Decontamination completed." | |
if [[ ! "true" = "true" ]] | |
then | |
ls -lha > workdir.txt | |
tar -c "*/" > "ERR7057565.tar" | |
fi | |
) > "$out91ebffb0" 2> "$err91ebffb0" | |
echo $? > /cromwell_root/rc.tmp | |
( | |
# add a .file in every empty directory to facilitate directory delocalization on the cloud | |
cd /cromwell_root | |
find . -type d -exec sh -c '[ -z "$(ls -A '"'"'{}'"'"')" ] && touch '"'"'{}'"'"'/.file' \; | |
) | |
( | |
cd /cromwell_root | |
sync | |
) | |
mv /cromwell_root/rc.tmp /cromwell_root/rc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment