Skip to content

Instantly share code, notes, and snippets.

@marcelm
Last active March 29, 2019 08:56
Show Gist options
  • Save marcelm/ab659840e0e83ef1fc131d64f507f947 to your computer and use it in GitHub Desktop.
Save marcelm/ab659840e0e83ef1fc131d64f507f947 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Mask low-quality bases in a FASTQ file with 'N'.
Adjust cutoff_front and cutoff_back below to use
different thresholds (currently: 20 at 5' end,
0 at 3' end).
Usage:
python3 qualmask.py input.fastq.gz > output.fastq
Or with compression:
python3 qualmask.py input.fastq.gz | gzip > output.fastq.gz
"""
from __future__ import print_function
from cutadapt.qualtrim import quality_trim_index
from cutadapt.seqio import open as openseq
import sys
# Fail with a readable usage message instead of an IndexError when the
# input path is missing. Extra arguments are still ignored, as before.
if len(sys.argv) < 2:
    sys.exit('Usage: python3 qualmask.py input.fastq.gz > output.fastq')

# Running total of bases replaced by 'N' across all reads.
masked = 0

# Read from the file named on the command line and write FASTQ records
# to standard output.
with openseq(sys.argv[1]) as reader, \
        openseq(sys.stdout, mode='w', fileformat='fastq') as writer:
    for read in reader:
        # (start, stop) describes where the good-quality part of the read is
        start, stop = quality_trim_index(read.qualities, cutoff_front=20, cutoff_back=0)
        length = len(read.sequence)
        # Keep the good-quality slice and replace everything before and
        # after it with 'N', so the read length is unchanged.
        read.sequence = 'N' * start + read.sequence[start:stop] + 'N' * (length - stop)
        # Count the bases outside [start, stop): those are the ones that
        # were masked. (stop - start would count the bases that were kept.)
        masked += start + (length - stop)
        writer.write(read)

# Report on stderr so the statistics do not end up in the FASTQ output.
print('Masked', masked, 'bases', file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment