Skip to content

Instantly share code, notes, and snippets.

@aaronsaunders
Created October 27, 2013 20:25
Show Gist options
  • Save aaronsaunders/7187482 to your computer and use it in GitHub Desktop.
Save aaronsaunders/7187482 to your computer and use it in GitHub Desktop.
file parsing and length plotting
"""
Mappings from file extensions to biopython types.
Copied from Erik Matsens seqmagick https://github.com/fhcrc/seqmagick/
"""
import argparse
import contextlib
import copy
import functools
import os
import os.path
import signal
import sys
import tempfile
import bz2
import gzip
import os.path
import sys
# Define mappings in a dictionary with extension : BioPython_file_type.
EXTENSION_TO_TYPE = {'.aln': 'clustal',
'.afa': 'fasta',
'.fa': 'fasta',
'.faa': 'fasta',
'.fas': 'fasta',
'.fasta': 'fasta',
'.fastq': 'fastq',
'.fq': 'fastq',
'.ffn': 'fasta',
'.fna': 'fasta',
'.frn': 'fasta',
'.gb': 'genbank',
'.gbk': 'genbank',
'.needle': 'emboss',
'.nex': 'nexus',
'.phy': 'phylip',
'.phylip': 'phylip',
'.phyx': 'phylip-relaxed',
'.qual': 'qual',
'.sff': 'sff-trim',
'.sth': 'stockholm',
'.sto': 'stockholm',}
COMPRESS_EXT = {'.bz2': bz2.BZ2File, '.gz': gzip.open, '.bz': bz2.BZ2File}
class UnknownExtensionError(ValueError):
pass
def from_extension(extension):
"""
Look up the BioPython file type corresponding with input extension.
Look up is case insensitive.
"""
if not extension.startswith('.'):
raise ValueError("Extensions must begin with a period.")
try:
return EXTENSION_TO_TYPE[extension.lower()]
except KeyError:
raise UnknownExtensionError("seqmagick does not know how to handle " +
"files with extensions like this: " + extension)
def from_filename(file_name):
"""
Look up the BioPython file type corresponding to an input file name.
"""
base, extension = os.path.splitext(file_name)
if extension in COMPRESS_EXT:
# Compressed file
extension = os.path.splitext(base)[1]
return from_extension(extension)
def from_handle(fh, stream_default='fasta'):
"""
Look up the BioPython file type corresponding to a file-like object.
For stdin, stdout, and stderr, ``stream_default`` is used.
"""
if fh in (sys.stdin, sys.stdout, sys.stderr):
return stream_default
return from_filename(fh.name)
class FileType(object):
"""
Near clone of argparse.FileType, supporting gzip and bzip
"""
def __init__(self, mode='r'):
self.mode = mode
self.ext_map = COMPRESS_EXT.copy()
def _get_handle(self, file_path):
ext = os.path.splitext(file_path)[1].lower()
return self.ext_map.get(ext, open)(file_path, self.mode)
def __call__(self, string):
if string == '-':
if 'r' in self.mode:
return sys.stdin
elif 'w' in self.mode:
return sys.stdout
else:
raise ValueError("Invalid mode: {0}".format(string))
else:
return self._get_handle(string)
from Bio.SeqIO import parse
from toolbox import biofileformat
import matplotlib.pyplot as plt
import sys
from collections import Counter
import fileinput
def get_lengths(fname, sub=False):
"""
Parses a sequence file and returns a list of sequence lengths
"""
with biofileformat.FileType('rb')(fname) as fh:
seqformat = biofileformat.from_handle(fh)
okformats = [ "fasta", "fastq" ]
if seqformat not in okformats:
print "takes only fasta/fastq w/wo compression"
return
if sub:
lengths = []
for n, rec in enumerate(parse(fh, seqformat)):
sizes.append(len(rec))
if n == (sub - 1):
break
else:
lengths = [len(rec) for rec in parse(fh, seqformat)]
return lengths
def draw_histogram(lengths):
"""
Parses a sequence file and returns a plot of sequence lengths.
Optional argument to subset the file.
"""
plt.hist(lengths)
plt.title("%s\n%i sequences, range %i to %i bp" \
% (fname, len(lengths), min(lengths),max(lengths)))
plt.xlabel("Sequence length (bp)")
plt.ylabel("Count")
plt.show()
return
def plot_seqlength(fname, sub=False):
draw_histogram(get_lengths(fname, sub=False))
return
def main(args):
usage = "Usage: plot_lengths.py infile"
if args[0] == "-h":
print usage
sys.exit()
for fname in args:
lengths = get_lengths(fname)
counts = Counter()
for l in lengths:
counts[l] += 1
outlines = [ '{}\t{}'.format( key, counts[key])
for key in sorted(counts.keys()) ]
sys.stdout.write('length\tcount\n')
sys.stdout.write('\n'.join(outlines))
sys.stdout.write('\n')
if __name__ == '__main__':
main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment