Created
August 13, 2020 16:15
-
-
Save bredelings/4888f41efc169805a92d42f70767600b to your computer and use it in GitHub Desktop.
How to parse Newick in Python using the parsy library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import re
import sys

from parsy import ParseError, generate, regex, string, success
def newick_quote(s):
    """Return `s` escaped so it can be written as a Newick node label.

    Three cases, checked from strongest to weakest:

    * `s` contains a character that cannot appear in an unquoted label:
      one of ``[ ] ( ) ; : , '``, an underscore (an unquoted underscore
      stands for a blank), or a tab / newline / carriage return.  The label
      is wrapped in single quotes and every embedded quote is doubled.
    * `s` contains blanks but nothing from the list above: every blank is
      replaced by an underscore and no quotes are added.
    * Otherwise `s` is returned unchanged.
    """
    # Whitespace other than the blank has no unquoted spelling, so it must
    # force quoting just like the punctuation characters do.
    if any(char in s for char in "[]();:,'_\t\n\r"):
        return "'" + s.replace("'", "''") + "'"
    if " " in s:
        return s.replace(" ", "_")
    return s
class Tree(object):
    """A tree node with an optional name, branch length and child list.

    `str(node)` gives the Newick text for the subtree rooted at the node,
    terminated by ``;``.
    """

    def __init__(self, name=None, length=None, children=None):
        self.name = name
        self.length = length
        self.children = children

    def _show(self):
        """Return the Newick text of this subtree without the final ';'."""
        pieces = []
        # An empty or missing child list produces no parentheses at all.
        if self.children:
            rendered = ",".join(kid._show() for kid in self.children)
            pieces.append(f"({rendered})")
        if self.name is not None:
            pieces.append(newick_quote(self.name))
        if self.length is not None:
            pieces.append(f":{self.length}")
        return "".join(pieces)

    def show(self):
        """Return the complete Newick string, including the closing ';'."""
        return f"{self._show()};"

    def __str__(self):
        return self.show()
# Floating-point literals.
# `decimals` matches a run of one or more ASCII digits; it is reused for the
# integer, fraction and exponent parts of a number.
decimals = regex(r"[0-9]+")
# This seems like it should be part of the library
@generate
def floating():
    """Parse a decimal number and return it as a `float`.

    Accepted shape: optional sign, digits, optional "." followed by digits,
    optional exponent ("e" or "E", optional sign, digits).  Digits before
    the decimal point are mandatory.
    """
    plus_or_minus = string("+") | string("-")
    exponent_marker = string("e") | string("E")
    exponent_part = exponent_marker + option(plus_or_minus, "") + decimals
    fraction_part = string(".") + decimals

    # Each optional piece contributes "" when absent, so the pieces can be
    # concatenated directly into the text handed to float().
    leading_sign = yield option(plus_or_minus, "")
    whole_digits = yield decimals
    fraction = yield option(fraction_part, "")
    power = yield option(exponent_part, "")
    return float(leading_sign + whole_digits + fraction + power)
def concat_str(strings):
    """Join an iterable of strings into one string with no separator."""
    glue = ""
    return glue.join(strings)
# Try `parser`; if it does not match, succeed anyway and produce `value`.
def option(parser, value=None):
    """Return a parser that yields `parser`'s result, or `value` on failure."""
    fallback = success(value)
    return parser | fallback
# Lexer part 1: space parser
# A comment is "[" ... "]" with no "]" inside; its text is discarded.
# Note that .desc("comment") binds to the closing-bracket parser only.
comment = string("[") >> regex(r"[^\]]*") >> string("]").desc("comment")
# Carriage returns count as whitespace too, so input with "\r\n" line
# endings is accepted.
whitespace = regex(r"[ \t\r\n]+").desc("whitespace")
# .many() already succeeds on zero matches, so no fallback is needed.
spaces = (comment | whitespace).many()

# Lexer part 2: in quoted strings, "''" changes to "'"
quoted_char = string("''") >> success("'") | regex(r"[^']")
quoted_label = (
    string("'") >> quoted_char.many().map(concat_str) << string("'")
).desc('quoted_label')

# Lexer part 3: in UNquoted string, "_" changes to " "
# Every character the whitespace parser accepts is excluded here; otherwise
# a tab or newline next to a name would be swallowed into the label.
unquoted_label = (
    string("_") >> success(" ")
    | regex(r"[^ \t\r\n()\[\]':;,]")
).many().map(concat_str).desc('unquoted_label')

# Parser
label = quoted_label | unquoted_label
# ":" followed by an optional number; a bare ":" yields None.
branch_length = string(":") >> spaces >> option(floating) << spaces
@generate
def subtree():
    """Parse one node: optional child list, optional label, optional length.

    Produces a `Tree` built from the three parsed pieces.
    """
    kids = yield option(descendant_list, [])
    node_name = yield spaces >> option(label)
    branch = yield spaces >> option(branch_length)
    return Tree(name=node_name, length=branch, children=kids)
@generate
def descendant_list():
    """Parse "(" subtree ("," subtree)* ")" and return the list of subtrees.

    Leading whitespace and comments before the "(" are skipped.
    """
    comma = string(",")
    kids = yield spaces >> string("(") >> subtree.sep_by(comma) << string(")")
    return kids
@generate
def tree():
    """Parse a complete Newick tree: a subtree followed by ';'.

    Whitespace and comments are allowed before the subtree, before the
    ';' and after it.  Returns the root `Tree`.
    """
    root = yield spaces >> subtree << spaces
    yield string(";") << spaces
    return root
def check(i, o):
    """Parse `i`, print it back as Newick and report when it differs from `o`.

    Prints a "Checking ..." line for every call and a "FAIL" line showing
    the actual output when the round trip does not produce `o`.  A parse
    error is not caught here and propagates to the caller.
    """
    print(f'Checking "{i}" -> "{o}"')
    # Parse once and reuse the result for both the comparison and the report.
    actual = str(tree.parse(i))
    if actual != o:
        print(" FAIL: got " + actual)
def check_error(i):
    """Verify that `i` is rejected by the `tree` parser and print the outcome.

    Prints "OK" when parsing raises `ParseError`, and
    "Parses, but should not!" when the input is accepted.  Any other
    exception (including KeyboardInterrupt) is deliberately not swallowed,
    so a bug in the parser cannot pass as a successful rejection.
    """
    print(f'Checking "{i}" does not parse: ', end="")
    try:
        tree.parse(i)
    except ParseError:
        print("OK")
    else:
        print("Parses, but should not!")
# With a command-line argument: parse it and print the tree back.
# Without one: run the built-in round-trip checks.
if sys.argv[1:]:
    print(tree.parse(sys.argv[1]))
else:
    # (input text, expected text after parsing and printing)
    round_trip_cases = [
        ("('a b',(b,c));", "(a_b,(b,c));"),
        ("(a_b,(b,c));", "(a_b,(b,c));"),
        ("('a_b',(b,c));", "('a_b',(b,c));"),
        ("('a''b',(b,c));", "('a''b',(b,c));"),
        ("('a ,b',(b,c));", "('a ,b',(b,c));"),
        ("('a ,b'[yo dude],(b,c));", "('a ,b',(b,c));"),
        ("('a ,b'[yo dude]:[hi!]0.1,(b,c));", "('a ,b':0.1,(b,c));"),
        ("(a?b : 0.1 , (b , c ));", "(a?b:0.1,(b,c));"),
        (";", ";"),
        ("();", "();"),
        ("(,);", "(,);"),
        ("((,),);", "((,),);"),
        ("((a,a)a,a)a;", "((a,a)a,a)a;"),
        ("a;", "a;"),
        ("a:0.1;", "a:0.1;"),
        ("a:1.0;", "a:1.0;"),
        ("a:1;", "a:1.0;"),
        ("a[node]:0.1[branch];", "a:0.1;"),
    ]
    for source_text, expected_text in round_trip_cases:
        check(source_text, expected_text)
    check_error("(a''b,(b,c));")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment