Skip to content

Instantly share code, notes, and snippets.

@isaacovercast
Created September 11, 2018 15:45
Show Gist options
  • Save isaacovercast/01a3c76ccdcd197bee0bac6524ab0081 to your computer and use it in GitHub Desktop.
Save isaacovercast/01a3c76ccdcd197bee0bac6524ab0081 to your computer and use it in GitHub Desktop.
def dadi_to_momi(infile, outfile=None, verbose=False):
if outfile == None:
outfile = infile.split(".sfs")[0] + "_momi.sfs"
dat = open(infile).readlines()
## Get rid of comment lines
dat = [x.strip() for x in dat if not x.startswith("#")]
if not len(dat) == 3:
raise Exception("Malformed dadi sfs {}.\n Must have 3 lines, yours has {}".format(infile, len(dat)))
## Parse the info line into nsamps per pop (list of ints), folding flag, and pop names list (if they are given)
info = dat[0].split()
nsamps = []
## Keep carving off values as long as they cast successfully as int
for i in info:
try:
nsamps.append(int(i))
except:
pass
nsamps = np.array(nsamps)
pops = [x.replace('"', '') for x in info[len(nsamps)+1:]]
folded = info[len(nsamps)]
folded = False if "un" in folded else True
if not len(pops) == len(nsamps):
print("Number of populations doesn't agree with number of pop names, using generic names.")
pops = ["pop"+x for x in range(len(nsamps))]
if verbose: print("Info nsamps={} folded={} pops={}".format(nsamps, folded, pops))
## Get mask
mask = list(map(int, dat[2].split()))
if verbose: print(mask)
## Get sfs, and reshape based on sample sizes
sfs = list(map(float, dat[1].split()))
if verbose: print(sfs)
length = np.ma.array(sfs, mask=mask).sum()
sfs = np.array(sfs).reshape(nsamps)
if verbose: print("length {}".format(length))
if verbose: print(sfs)
## Get counts per sfs bin
counts = Counter()
for sfsbin in product(*[range(y) for y in [x for x in nsamps]]):
## Ignore monomorphic snps
## nsamps minus 1 here because of the off by one diff between number
## of bins in the sfs and actual number of samples
if sfsbin == tuple(nsamps-1) or sfsbin == tuple([0] * len(nsamps)):
continue
## ignore zero bin counts
if sfs[sfsbin] == 0:
continue
if verbose: print(sfsbin, sfs[sfsbin]),
counts.update({sfsbin:sfs[sfsbin]})
if verbose: print("nbins {}".format(len(counts)))
## Convert SFS bin style into momi config style
configs = pd.DataFrame(index=range(len(counts)), columns=pops)
locus_info = []
for i, c in enumerate(counts):
## (n-1) here because nbins in dadi sfs is n+1
cfg = np.array([[(n-1)-x, x] for x,n in zip(c, nsamps)])
configs.iloc[i] = [list(map(int, list(x))) for x in cfg]
locus_info.append([0, i, counts[c]])
if verbose: print("n_snps {}".format(np.sum([x[2] for x in locus_info])))
## Finally build the momi style sfs dictionary and write it out
momi_sfs = {"sampled_pops":pops,
"folded":folded,
"length":int(length),
"configs":configs.values.tolist(),
"(locus,config_id,count)":locus_info}
with open(outfile, 'w') as out:
out.write("{}".format(json.dumps(momi_sfs)))
## make it pretty
sfs = momi.Sfs.load(outfile)
## Fold if unfolded
if folded: sfs = sfs.fold()
sfs.dump(outfile)
#dadi_to_momi(infile, verbose=True)
@ldutoit
Copy link

ldutoit commented Jun 23, 2019

Hi Isaac, I am having a look at your great little piece of code in order to transform a dadi sfs to a momi one that I can input in momi.Sfs.load(). That is what I am looking at I believe? I am really grateful for your code but I am not managing to make it work because I am not sure which module to load and which version of python you are running. I believe Counter() comes from collections but at line 42 you got prouct and I am not sure where to find it. Could you help me? Thank you loads! Ludovic Dutoit

@isaacovercast
Copy link
Author

isaacovercast commented Jun 23, 2019 via email

@ldutoit
Copy link

ldutoit commented Jun 23, 2019

Hi Isaac, that is great! I could not track product() as it did not exist in python 2 (unlike Counter()). Thank you for Open Science and great to know it is in Momi. Ludo

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment