Skip to content

Instantly share code, notes, and snippets.

@mgalardini
Forked from rchikhi/estimate-insert-sizes
Last active March 10, 2017 09:46
Show Gist options
  • Save mgalardini/20ec8bec22ae6e16e29ded4c1f050839 to your computer and use it in GitHub Desktop.
Save mgalardini/20ec8bec22ae6e16e29ded4c1f050839 to your computer and use it in GitHub Desktop.
Quickly estimates insert sizes of read datasets, given some sequence(s) they can be mapped to. Requires BWA. Short usage: <reference> <*.fastq>
#!/usr/bin/env python
doc = """
Quickly estimates insert sizes of read datasets, given some sequence(s) they can be mapped to.
Author: Rayan Chikhi
short usage: <reference> <*.fastq>
example:
estimate-insert-sizes contigs.fa readsA_1.fq readsA_2.fq readsB_1.fq readsB_2.fq
or, with shell globbing:
estimate-insert-sizes contigs.fa *.fq
special case, a single argument is interpreted as interleaved pairs:
estimate-insert-sizes contigs.fa interleaved.fq
"""
""" technical note:
by default, bwa will be executed with "-t X" to read X*100kbp sequences, instead of just 100kbp.
100kbp is not enough in my experience to detect insert sizes.
incidentally, bwa will use X threads, even if the cpu has less cores than that.
X can be changed by modifying this variable:
"""
nb_threads = 5
# --------
from glob import glob
import subprocess
import sys, os
if len(sys.argv) < 3:
exit(doc)
reference = sys.argv[1]
reads = sorted(sys.argv[2:])
try:
subprocess.call(["bwa"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except:
exit("Please make sure that the `bwa` binary is in your $PATH")
for read in reads:
if not os.path.isfile(read):
exit("Error: %s does not exist" % read)
if len(reads) == 1:
print("Assuming that %s is interleaved" % reads[0])
reads += [""]
if not os.path.isfile(reference+".sa"):
print("Creating index file..")
subprocess.call(["bwa", "index", reference], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
def parse_list(line, nb_elts):
# specific to BWA-MEM stderr format
return map(lambda x: int(float(x)), ' '.join(line.strip().replace(',','').split()[-nb_elts:])[1:-1].split())
stats = dict()
for read1, read2 in zip(reads[::2],reads[1::2]):
print("Processing: \n %s \n %s " % (read1,read2))
cmd = ["bwa", "mem"] + (["-p"] if read2 == "" else []) + ["-t %d" % nb_threads, reference, read1, read2]
DEVNULL = open(os.devnull, 'wb')
process = subprocess.Popen(cmd, stdout=DEVNULL, stderr=subprocess.PIPE)
seen_candidate_line = False
while True:
line = process.stderr.readline()
if line == '' and process.poll() != None:
break
if "worker" in line:
break
if "pestat" not in line:
continue
if "candidate unique pairs for" in line:
if seen_candidate_line:
break
seen_candidate_line = True
nb_pairs = parse_list(line,4)
for i in xrange(4):
stats[['FF', 'FR', 'RF', 'RR'][i]] = { 'nb_pairs' : nb_pairs[i] }
if "orientation" in line:
orientation = line.strip().split()[-1].replace('.','')
if "mem_pestat] mean and std.dev:" in line:
mean, stdev = parse_list(line,2)
stats[orientation]['mean'] = mean
stats[orientation]['stdev'] = stdev
if orientation == 'RR':
# stats are complete
break
sys.stdout.write(line)
sys.stdout.flush()
if process.poll() is None:
process.terminate()
results = sorted(stats.items(), key = lambda x: x[1]['nb_pairs'], reverse=True)
most_likely = results[0]
mean = most_likely[1]['mean']
stdev = most_likely[1]['stdev']
print("Orientation", most_likely[0], "mean", mean, "stdev", stdev)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment