Skip to content

Instantly share code, notes, and snippets.

@aofarrel
Created June 7, 2023 19:28
Show Gist options
  • Save aofarrel/fa1fd24cdfafb387e53ba177fa05317b to your computer and use it in GitHub Desktop.
Save aofarrel/fa1fd24cdfafb387e53ba177fa05317b to your computer and use it in GitHub Desktop.
Pulled from the folder of a run of a decontamination task (the first one listed here: https://github.com/aofarrel/clockwork-wdl/blob/main/tasks/combined_decontamination.wdl) which ran on Terra. The file is known as "script" with no extension, but has a bash shebang.
#!/bin/bash
cd /cromwell_root
tmpDir=$(mkdir -p "/cromwell_root/tmp.65940862" && echo "/cromwell_root/tmp.65940862")
chmod 777 "$tmpDir"
export _JAVA_OPTIONS=-Djava.io.tmpdir="$tmpDir"
export TMPDIR="$tmpDir"
export HOME="$HOME"
(
cd /cromwell_root
)
out91ebffb0="${tmpDir}/out.$$" err91ebffb0="${tmpDir}/err.$$"
mkfifo "$out91ebffb0" "$err91ebffb0"
trap 'rm "$out91ebffb0" "$err91ebffb0"' EXIT
touch '/cromwell_root/stdout' '/cromwell_root/stderr'
tee '/cromwell_root/stdout' < "$out91ebffb0" &
tee '/cromwell_root/stderr' < "$err91ebffb0" >&2 &
(
cd /cromwell_root
READS_FILES_UNSORTED=("/cromwell_root/topmed_workflow_testing/tb/sra/ERR7057565_1.fastq" "/cromwell_root/topmed_workflow_testing/tb/sra/ERR7057565_2.fastq")
# make sure reads are paired correctly
#
# clockwork map_reads seems to require each pair of fqs are consecutive, such as:
# (SRR1_1.fq, SRR1_2.fq, SRR2_1.fq, SRR2_2.fq)
# If you instead had (SRR1_1.fq, SRR2_1.fq, SRR1_2.fq, SRR2_2.fq) then fqcount would
# fail assuming SRR1 and SRR2 have different read counts. Interestingly, this was
# never an issue when downloading reads via SRANWRP, but to better support direct
# input of reads, this sort of hack is necessary.
readarray -t READS_FILES < <(for fq in "${READS_FILES_UNSORTED[@]}"; do echo "$fq"; done | sort)
# downsample, if necessary
#
# Downsampling relies on deleting inputs and then putting a new file where the the old
# input was. This works on Terra, but there is a chance this gets iffy elsewhere.
# If you're having issues with miniwdl, --copy-input-files might help
if [[ "450" != "-1" ]]
then
for inputfq in "${READS_FILES[@]}"
do
size_inputfq=$(du -m "$inputfq" | cut -f1)
# shellcheck disable=SC2004
# just trust me on this one
if (( $size_inputfq > 450 ))
then
seqtk sample -s1965 "$inputfq" 1000000 > temp.fq
rm "$inputfq"
mv temp.fq "$inputfq"
echo "WARNING: downsampled $inputfq (was $size_inputfq MB)"
fi
done
fi
# Terra-Cromwell does not place you in the home dir, but rather one folder down, so we have
# to go up one to get the ref genome. miniwdl goes further. Who knows what other executors do.
# The tar command will however place the untarred directory in the workdir.
tar -xvf /ref/Ref.remove_contam.tar
# anticipate bad fastqs
#
# This is a hack to make sure the check_this_fastq task output is defined iff this
# WDL task fails. The duplicate will be deleted if we decontam successfully. We use
# copies of the inputs WDL gets iffy when trying to glob on optionals, and because
# deleting inputs is wonky on some backends (understandably!)
for inputfq in "${READS_FILES[@]}"
do
cp "$inputfq" "ERR7057565_1.fastq_dcntmfail.fastq"
done
# map reads for decontamination
timeout -v 0m clockwork map_reads \
--unsorted_sam \
\
ERR7057565 \
Ref.remove_contam/ref.fa \
ERR7057565.sam \
"${READS_FILES[@]}"
exit=$?
if [[ $exit = 124 ]]
then
echo "ERROR -- clockwork map_reads timed out"
if [[ "false" = "true" ]]
then
set -eux -o pipefail
exit 1
else
exit 0
fi
elif [[ $exit = 137 ]]
then
echo "ERROR -- clockwork map_reads was killed -- it may have run out of memory"
if [[ "false" = "true" ]]
then
set -eux -o pipefail
exit 1
else
exit 0
fi
elif [[ $exit = 0 ]]
then
echo "Reads successfully mapped to decontamination reference"
elif [[ $exit = 1 ]]
then
echo "ERROR -- clockwork map_reads errored out for unknown reasons"
set -eux -o pipefail
exit 1
else
echo "ERROR -- clockwork map_reads returned $exit for unknown reasons"
set -eux -o pipefail
exit 1
fi
echo "************ removing contamination *****************"
# calculate the last three positional arguments of the rm_contam task
if [[ ! "" = "" ]]
then
arg_counts_out=""
else
arg_counts_out="ERR7057565.decontam.counts.tsv"
fi
arg_reads_out1="ERR7057565_1.decontam.fq.gz"
arg_reads_out2="ERR7057565_2.decontam.fq.gz"
# TODO: this doesn't seem to be in the nextflow version of this pipeline, but it seems
# we need it in the WDL version?
# https://github.com/iqbal-lab-org/clockwork/issues/77
# https://github.com/iqbal-lab-org/clockwork/blob/v0.11.3/python/clockwork/contam_remover.py#L170
#
# This might intereact with unsorted_sam, which seems to actually be a dupe remover
# https://github.com/iqbal-lab-org/clockwork/blob/v0.11.3/python/clockwork/tasks/map_reads.py#L18
# https://github.com/iqbal-lab-org/clockwork/blob/v0.11.3/python/clockwork/read_map.py#L26
samtools sort -n ERR7057565.sam > sorted_by_read_name_ERR7057565.sam
# One of remove_contam's tasks will throw a warning about index files. Ignore it.
# https://github.com/mhammell-laboratory/TEtranscripts/issues/99
timeout -v 0m clockwork remove_contam \
Ref.remove_contam/remove_contam_metadata.tsv \
sorted_by_read_name_ERR7057565.sam \
$arg_counts_out \
$arg_reads_out1 \
$arg_reads_out2 \
\
\
exit=$?
if [[ $exit = 124 ]]
then
echo "ERROR -- clockwork remove_contam timed out"
if [[ "false" = "true" ]]
then
set -eux -o pipefail
exit 1
else
exit 0
fi
elif [[ $exit = 137 ]]
then
echo "ERROR -- clockwork remove_contam was killed -- it may have run out of memory"
if [[ "false" = "true" ]]
then
set -eux -o pipefail
exit 1
else
exit 0
fi
elif [[ $exit = 0 ]]
then
echo "Reads successfully decontaminated"
elif [[ $exit = 1 ]]
then
echo "ERROR -- clockwork remove_contam errored out for unknown reasons"
set -eux -o pipefail
exit 1
else
echo "ERROR -- clockwork remove_contam returned $exit for unknown reasons"
set -eux -o pipefail
exit 1
fi
# We passed, so delete the output that would signal to run fastqc
rm "ERR7057565_1.fastq_dcntmfail.fastq"
echo "Decontamination completed."
if [[ ! "true" = "true" ]]
then
ls -lha > workdir.txt
tar -c "*/" > "ERR7057565.tar"
fi
) > "$out91ebffb0" 2> "$err91ebffb0"
echo $? > /cromwell_root/rc.tmp
(
# add a .file in every empty directory to facilitate directory delocalization on the cloud
cd /cromwell_root
find . -type d -exec sh -c '[ -z "$(ls -A '"'"'{}'"'"')" ] && touch '"'"'{}'"'"'/.file' \;
)
(
cd /cromwell_root
sync
)
mv /cromwell_root/rc.tmp /cromwell_root/rc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment