Created
February 25, 2019 00:34
-
-
Save linhai86/4d40deda3fe75b1e971b7083aa371d68 to your computer and use it in GitHub Desktop.
Given mutation, extract flanking protein sequence.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
class Mut2Seq: | |
"""Given mutation, extract flanking protein sequence using Ensembl API. | |
""" | |
def __init__(self, server='https://rest.ensembl.org', | |
species='homo_sapiens'): | |
self.server = server | |
self.species = species | |
def mut2seq(self, chrom, start, end, allele, flank=5): | |
results = [] | |
hgvsp = self._get_hgvsp(chrom, start, end, allele) | |
for k, v in hgvsp.items(): | |
results.append(self._get_protein_seq(v, flank)) | |
return results | |
def _get_hgvsp(self, chrom, start, end, allele): | |
ext = '/vep/{species}/region/{chrom}:{start}:{end}/{allele}?canonical=1&hgvs=1&distance=0&protein=1'.format( | |
species=self.species, | |
chrom=chrom, | |
start=start, | |
end=end, | |
allele=allele | |
) | |
headers = { "Content-Type" : "application/json"} | |
response = requests.get(self.server + ext, headers=headers) | |
if not response.ok: | |
response.raise_for_status() | |
response_json = response.json() | |
assert len(response_json) == 1 | |
response_json = response_json.pop() | |
hgvsp = {} | |
if 'transcript_consequences' in response_json: | |
for transcript in response_json['transcript_consequences']: | |
if 'hgvsp' in transcript: | |
hgvsp[transcript['hgvsp']] = { | |
'protein_id': transcript['protein_id'], | |
'protein_start': transcript['protein_start'], | |
'protein_end': transcript['protein_end'], | |
'amino_acids': transcript['amino_acids'].split('/'), | |
} | |
return hgvsp | |
def _get_protein_seq(self, hgvsp, flank): | |
ext = '/sequence/id/{protein_id}?'.format(protein_id=hgvsp['protein_id']) | |
headers = { "Content-Type" : "text/plain"} | |
response = requests.get(self.server + ext, headers=headers) | |
if not response.ok: | |
response.raise_for_status() | |
seq = response.text | |
assert seq[hgvsp['protein_start']-1:hgvsp['protein_end']] == hgvsp['amino_acids'][0] | |
extract_start = max(hgvsp['protein_start'] - 1 - flank, 0) | |
extract_end = min(hgvsp['protein_end'] + flank, len(seq)) | |
hgvsp['ref_seq'] = seq[extract_start:extract_end] | |
if len(hgvsp['amino_acids']) == 2: | |
hgvsp['alt_seq'] = seq[extract_start:hgvsp['protein_start']-1] + hgvsp['amino_acids'][1] + seq[hgvsp['protein_end']:extract_end] | |
return hgvsp | |
m2s = EnsMut2Seq() | |
x = m2s.mut2seq(1, 6526802, 6526802, 'C') | |
# x: | |
# [{'protein_id': 'ENSP00000366934', | |
# 'protein_start': 618, | |
# 'protein_end': 618, | |
# 'amino_acids': ['K', 'R'], | |
# 'ref_seq': 'GIDMEKRLYHI', | |
# 'alt_seq': 'GIDMERRLYHI'}] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment