Skip to content

Instantly share code, notes, and snippets.

@PoisonAlien
Last active July 17, 2024 17:17
Show Gist options
  • Save PoisonAlien/ce87512ead051da3a36019b78ebc675c to your computer and use it in GitHub Desktop.
Save PoisonAlien/ce87512ead051da3a36019b78ebc675c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#A simple script to predict gnomAD ancestry using PCA loadings trained on gnomAD V3 datasets
#See here for details: https://gnomad.broadinstitute.org/news/2021-09-using-the-gnomad-ancestry-principal-components-analysis-loadings-and-random-forest-classifier-on-your-dataset/
#Author: Anand Mayakonda
import sys
import os.path
import shutil
import argparse
parser = argparse.ArgumentParser(
description="Predict ancestry with gnomad ancestry RF models")
parser.add_argument("-v", "--vcf", help="Input VCF",required=True)
parser.add_argument("-p", "--pca", help="PCA loadings",required=True)
parser.add_argument("-r", "--rf", help="Rnadom forest models", required=True)
parser.add_argument("-b", "--name", help="Output basename", required=True)
args = parser.parse_args()
VCF = args.vcf #Input VCF file
pcloadings = args.pca #gnomad PCA loadings
rfmodel = args.rf #gnomad RF model
vcfbase = args.name #Output file basename
#temp_directory=sys.argv[5] #do not use /tmp
import hail as hl
import sklearn
import pickle
from gnomad.sample_qc.ancestry import assign_population_pcs
hl.init(tmp_dir="./")
loadings_ht = hl.read_table(pcloadings)
mtout = vcfbase + ".mt"
hl.import_vcf(VCF, reference_genome='GRCh38', force_bgz=True,array_elements_required=False).write(mtout, overwrite=True)
mt_to_project = hl.read_matrix_table(mtout)
ht = hl.experimental.pc_project(
mt_to_project.GT,
loadings_ht.loadings,
loadings_ht.pca_af,
)
with hl.hadoop_open(rfmodel, "rb") as f:
fit = pickle.load(f)
num_pcs = fit.n_features_in_
ht = ht.annotate(scores=ht.scores[:num_pcs])
#vcf doesnt have the column 'known_pop' required by hail
ht = ht.annotate(known_pop="NA")
ht, rf_model = assign_population_pcs(
ht,
pc_cols=ht.scores,
fit=fit,
)
ht.export(vcfbase+'.rf.est.tsv', delimiter='\t')
print("Cleaning up..")
shutil.rmtree(mtout, ignore_errors=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment