Created
March 27, 2017 14:39
-
-
Save fbrundu/a5aa436af7f3f1ff8069c7b31416f56b to your computer and use it in GitHub Desktop.
Retrieve TCGA gene expression data using the GDC API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import logging as log | |
import pandas as pd | |
import requests as rq | |
class TCGA:
    ''' Retrieve TCGA gene expression data through the GDC API.

    File ids are looked up on the ``/files`` endpoint, then every matching
    file is downloaded from the ``/data`` endpoint and the downloads are
    merged column-wise into a single DataFrame.
    '''

    def __init__(self, gdc_url='https://gdc-api.nci.nih.gov', per_page=100,
                 logfile=None):
        ''' Initialisation

        :param gdc_url: base URL of the GDC API (no trailing slash; the
            endpoint paths are appended to it as ``/files`` and ``/data``)
        :param per_page: number of records requested per ``/files`` query
        :param logfile: file name handed to ``logging.basicConfig``; with
            ``None`` no file name is configured
        '''
        self.gdc_url = gdc_url
        self.per_page = per_page
        # NOTE: this configures the root logger as a side effect.
        log.basicConfig(filename=logfile, level=log.INFO,
                        format='%(asctime)s : %(levelname)8s : %(message)s (%(module)s.%(funcName)s)',
                        datefmt='%Y-%m-%d %H:%M:%S')

    def get_geneexp(self, projects, exp_strategy='RNA-Seq',
                    workflow='HTSeq - FPKM'):
        ''' Download and merge the gene expression files of some projects.

        :param projects: non-empty collection of project ids, matched
            against ``cases.project.project_id``
        :param exp_strategy: value required for ``experimental_strategy``
        :param workflow: value required for ``files.analysis.workflow_type``
        :returns: DataFrame indexed by the first column of the downloaded
            files, with one column per file labelled with its file id; an
            empty DataFrame when no file matches the filters
        :raises RuntimeError: if ``projects`` is empty
        :raises requests.HTTPError: if a ``/files`` query fails
        '''
        filters = _FilterBuilder.logical(
            'and', [
                _FilterBuilder.equal('files.data_type',
                                     'Gene Expression Quantification'),
                _FilterBuilder.equal('experimental_strategy', exp_strategy),
                _FilterBuilder.equal('files.analysis.workflow_type', workflow),
                _FilterBuilder.inclusion('cases.project.project_id', projects)])
        file_ids = self._get_file_ids(filters)
        log.info(f'{len(file_ids)} files found')

        # Nothing matched: hand back an empty frame instead of failing on
        # ``None.columns`` further down.
        if not file_ids:
            return pd.DataFrame()

        # Collect every download first and concatenate once; concatenating
        # inside the loop copies the growing frame on each iteration.
        frames = []
        for i, fid in enumerate(file_ids, start=1):
            frames.append(pd.read_table(f'{self.gdc_url}/data/{fid}',
                                        compression='gzip', index_col=0,
                                        header=None))
            log.info(f'{i:4}. File {fid} integrated')
        df = pd.concat(frames, axis=1)
        df.columns = file_ids
        return df

    def _get_file_ids(self, filters):
        ''' Return the ids of all files matching ``filters``.

        The first query also reports the total number of matches, which
        determines how many further pages have to be requested.

        :param filters: filter dictionary sent as the ``filters`` field
        :returns: list of ``file_id`` values, in the order returned
        :raises requests.HTTPError: if any query returns an error status
        '''
        first_page = self._post_files(filters)
        file_ids = [hit['file_id'] for hit in first_page['hits']]
        total = first_page['pagination']['total']
        for offset in self._page_offsets(total, self.per_page):
            page = self._post_files(filters, offset)
            file_ids += [hit['file_id'] for hit in page['hits']]
        return file_ids

    def _post_files(self, filters, offset=None):
        ''' Query ``/files`` once and return the ``data`` part of the reply.

        :param filters: filter dictionary sent as the ``filters`` field
        :param offset: value of the ``from`` parameter; omitted from the
            query when ``None``
        :raises requests.HTTPError: on a 4xx/5xx response, so that a failed
            page is reported instead of being silently left out
        '''
        url = f'{self.gdc_url}/files?size={self.per_page}'
        if offset is not None:
            url += f'&from={offset}'
        resp = rq.post(url, json={'filters': filters})
        resp.raise_for_status()
        return resp.json()['data']

    @staticmethod
    def _page_offsets(total, per_page):
        ''' Return the ``from`` offsets of every page after the first one.

        ``from`` is a zero-based record offset in the GDC API, so the second
        page starts at ``per_page`` (not ``per_page + 1``, which would skip
        one record).
        '''
        return range(per_page, total, per_page)
class _FilterBuilder:
    ''' Build filter dictionaries of the form ``{'op': ..., 'content': ...}``. '''

    @staticmethod
    def logical(op, args):
        ''' Logical operator

        :param op: operator name stored under ``op`` (e.g. ``'and'``)
        :param args: iterable of filter dictionaries to combine
        :returns: filter whose ``content`` is a new list of ``args``
        '''
        return {'op': op, 'content': list(args)}

    @staticmethod
    def inclusion(field, values):
        ''' Inclusion operator

        :param field: name of the field to test
        :param values: non-empty collection of accepted values; a single
            string is treated as a one-element list
        :returns: filter with ``op`` set to ``'in'``
        :raises RuntimeError: if ``values`` is empty
        '''
        if len(values) < 1:
            raise RuntimeError(f'Invalid number of values: {len(values)}')
        # A bare string has a length too, but the operator is meant to
        # receive a collection of values, not a scalar.
        if isinstance(values, str):
            values = [values]
        return {'op': 'in', 'content': {'field': field, 'value': values}}

    @staticmethod
    def equal(field, value):
        ''' Equal operator

        :param field: name of the field to test
        :param value: value the field must be equal to
        :returns: filter with ``op`` set to ``'='``
        '''
        return {'op': '=', 'content': {'field': field, 'value': value}}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment