Skip to content

Instantly share code, notes, and snippets.

@brantfaircloth
Last active March 11, 2020 14:54
Show Gist options
  • Save brantfaircloth/7ffbc6a8501c85d05f03 to your computer and use it in GitHub Desktop.
Save brantfaircloth/7ffbc6a8501c85d05f03 to your computer and use it in GitHub Desktop.
split (gzipped) fastq files to temp files quickly in python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
(c) 2014 Brant Faircloth || http://faircloth-lab.org/
All rights reserved.
This code is distributed under a 3-clause BSD license. Please see
LICENSE.txt for more information.
Created on 08 October 2014 22:07 CDT (-0500)
"""
import os
import gzip
import itertools
import tempfile
import multiprocessing
#import pdb
def is_gzip(filename):
magic_bytes = "\x1f\x8b\x08"
with open(filename) as infile:
file_start = infile.read(len(magic_bytes))
if file_start.startswith(magic_bytes):
return True
return False
def split(filename):
name = os.path.splitext(os.path.basename(filename))[0] + "-"
if is_gzip(filename):
with gzip.open(filename, 'rb') as infile:
reads = itertools.izip_longest(*[infile]*4)
tempfiles = write(reads, name, gzip=True)
else:
with open(filename, 'rU') as infile:
reads = itertools.izip_longest(*[infile]*4)
tempfiles = write(reads, name, gzip=False)
return tempfiles
def batches(reads):
while True:
batchiter = itertools.islice(reads, 10000)
yield itertools.chain([batchiter.next()], batchiter)
def write(reads, name, gzip=True):
tempfiles = []
for cnt, batch in enumerate(batches(reads)):
if gzip:
with tempfile.NamedTemporaryFile(mode="w+b", delete=False, prefix=name, suffix=".gz") as outfile:
gz = gzip.GzipFile(mode="wb", fileobj=outfile)
[gz.write("{}{}{}{}".format(*seq)) for seq in batch]
tempfiles.append(outfile.name)
else:
with tempfile.NamedTemporaryFile(mode="w", delete=False, prefix=name, suffix=".fastq") as outfile:
[outfile.write("{}{}{}{}".format(*seq)) for seq in batch]
tempfiles.append(outfile.name)
return tempfiles
# fastq => fastq files: 100k parsed on SSD in 0.364 sec total
r1 = "R1-100k.fastq"
r2 = "R2-100k.fastq"
# fastq.gz => fastq.gz files: 100k parsed on SSD in 31.442 sec total
#r1 = "R1-100k.fastq.gz"
#r2 = "R2-100k.fastq.gz"
# IO-bound, but still => speedup
p = multiprocessing.Pool(2)
tempfiles = p.map(split, [r1,r2])
# re-pair split R1 with R2 mates
combos = zip(tempfiles[0], tempfiles[1])
# cleanup
[os.remove(f) for f in tempfiles[0]]
[os.remove(f) for f in tempfiles[1]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment