Last active
April 17, 2018 12:57
-
-
Save walterst/96de484cdcfb13c9bed3fa27d8576dce to your computer and use it in GitHub Desktop.
Very simple fastq parser/checker to try and detect errors. assumes lines will be exactly (@Label, sequence, +, quality scores). Checks for expected chars at label/optional label, equal length of seq/qual.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Used to find fastq seqs in gzipped files, write first error, if any, to a log file | |
# Usage: python find_fastq_errors.py fastq_folder log_file | |
# where fastq_folder has all of the fastq files in it-will search subdirectories | |
from sys import argv | |
from glob import glob | |
import gzip | |
import os | |
output_log = open(argv[2], "w") | |
fastq_files = [] | |
fastq_files.extend(glob(argv[1] + "/*.fastq.gz") + glob(argv[1] + "/*.fastq")) | |
for root,dirs,files in os.walk(argv[1]): | |
for curr_dir in dirs: | |
fastq_files.extend(glob(curr_dir + "/*.fastq.gz") + glob(curr_dir + "/*.fastq")) | |
# Expected to be 4 line per sequence, no empty lines, this is limited parser | |
for curr_file in fastq_files: | |
if curr_file.endswith('.gz'): | |
query_reads = gzip.open(curr_file, "rb") | |
else: | |
query_reads = open(curr_file, "U") | |
keep_reading = True | |
error_data = "" | |
while keep_reading: | |
label = query_reads.readline().strip() | |
if(label==""): # hit end of file | |
keep_reading = False | |
break | |
if not label.startswith("@"): | |
error_data += "Found read label without @: %s\n" % label | |
seq = query_reads.readline().strip() | |
opt_label = query_reads.readline().strip() | |
if not opt_label.startswith("+"): | |
error_data += "Found optional read label without +: %s\n" % opt_label | |
qual = query_reads.readline().strip() | |
if len(seq) != len(qual): | |
error_data += "Found seq and qual of unequal lengths: \n%s\n%s" % (seq, qual) | |
if error_data: | |
break | |
if error_data: | |
output_log.write("%s\n%s\n" % (curr_file, error_data)) | |
query_reads.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment