Last active
October 15, 2021 12:56
-
-
Save PoisonAlien/4a5fcc2126a731417671c4738bcd94cc to your computer and use it in GitHub Desktop.
Tool to extract nucleotide counts at user specific loci
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//A minimal program to extract nucelotide counts of selected genomic loci from the BAM file | |
//gcc -g -O3 -pthread ntcounts.c -lhts -Ihts -o ntcounts | |
//MIT License | |
//Copyright (c) 2021 Anand Mayakonda <anandmt3@gmail.com> | |
#include <unistd.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <htslib/sam.h> | |
#include <htslib/faidx.h> | |
#include <argp.h> | |
#define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||" | |
#define PBWIDTH 60 | |
void is_bam(char *fname); | |
char *basename(char const *path); | |
char *removeExt(char* myStr); | |
int countlines(char *filename); | |
void printProgress(double percentage, int done, int tot); | |
void usage(); | |
char *VERSION = "0.1.0"; | |
int main (int argc, char **argv){ | |
char *fafile = NULL; | |
char *op = NULL; | |
int c; | |
char *bedfile = NULL; | |
char *bam = NULL; | |
uint32_t q = 10; //BAM default MAPQ | |
uint16_t F = 3852; //BAM default FLAG | |
int vars_gt = 1; //No. of BED entries | |
int nloci = 0; | |
opterr = 0; | |
hts_verbose = 0; //suppresses htslib warnings | |
//Parse cl args | |
while ((c = getopt (argc, argv, "q:o:f:F:")) != -1){ | |
switch (c) | |
{ | |
case 'q': | |
q = atoi(optarg); | |
break; | |
case 'o': | |
op = optarg; | |
break; | |
case 'f': | |
fafile = atoi(optarg); | |
break; | |
case 'F': | |
F = atoi(optarg); | |
break; | |
case '?': | |
if (optopt == 'm'){ | |
fprintf (stderr, "MAPQ must be provided when -%c is specified.\n", optopt); | |
usage(); | |
} | |
if (optopt == 'v'){ | |
fprintf (stderr, "VAF must be provided when -%c is specified.\n", optopt); | |
usage(); | |
} | |
if (optopt == 'F'){ | |
fprintf (stderr, "FLAG must be provided when -%c is specified.\n", optopt); | |
usage(); | |
} | |
if (optopt == 'f'){ | |
usage(); | |
fprintf (stderr, "Input fasta file must provided when -%c is specified.\n", optopt); | |
} else if (isprint (optopt)){ | |
fprintf (stderr, "Unknown option `-%c'.\n", optopt); | |
usage(); | |
}else{ | |
fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); | |
usage(); | |
} | |
return 1; | |
default: | |
abort (); | |
} | |
} | |
bedfile = argv[optind]; | |
if(bedfile == NULL){ | |
usage(); | |
fprintf(stderr, "Missing loci file!\n"); | |
return 0; | |
} | |
bam = argv[optind+1]; | |
if(bam == NULL){ | |
usage(); | |
fprintf(stderr, "Missing BAM file!\n"); | |
return 0; | |
} | |
is_bam(bam); //Check the file extension to to figure out BAM file or not | |
char tsv_file[1000]; | |
char *bn = basename(bam); //get basename of the file | |
char *bnn = removeExt(bn); //remove extension from the basename | |
if(op == NULL){ | |
strcpy(tsv_file, bnn); | |
strcat(tsv_file, ".tsv"); | |
}else{ | |
strcpy(tsv_file, op); | |
strcat(tsv_file, ".tsv"); | |
} | |
//Open bed file | |
FILE *bed_fp; | |
nloci = countlines(bedfile); | |
bed_fp = fopen(bedfile, "r"); | |
char buff[1000]; | |
//Open TSV report file and print HTML header | |
FILE *tsv_fp; | |
tsv_fp = fopen(tsv_file, "w" ); | |
fprintf(tsv_fp, "loci\tfa_ref\tA\tT\tG\tC\tIns\tDel\n"); | |
//fasta file | |
char *seq; | |
faidx_t *fa = fai_load(fafile); | |
//BAM file | |
samFile *fp_in = hts_open(bam,"r"); //open bam file | |
hts_idx_t *fp_idx = sam_index_load(fp_in, bam); | |
bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header | |
bam1_t *aln = bam_init1(); //initialize an alignment | |
fprintf(stderr, "Input BAM file : %s\n", bam); | |
fprintf(stderr, "Loci file : %s\n", bedfile); | |
fprintf(stderr, "No. of loci : %d\n", nloci); | |
fprintf(stderr, "MAPQ filter : %d\n", q); | |
fprintf(stderr, "FLAG filter : %d\n", F); | |
fprintf(stderr, "HTSlib version : %s\n\n", hts_version()); | |
//For every loci in the BED file | |
while(fgets(buff,1000,bed_fp) != NULL){ | |
//Remove trailing new line chars | |
int len = strlen(buff); | |
if(buff[len-1] == '\n' ){ | |
buff[len-1] = 0; | |
} | |
char *chrom = strtok(buff,"\t"); | |
char *start = strtok(NULL,"\t"); | |
char loci[250] = ""; | |
strcat(loci, chrom); strcat(loci, ":"); strcat(loci, start); strcat(loci, "-"); strcat(loci, start); | |
//Fetch base at target loci from fasta file | |
if(fa != NULL){ | |
int templen = 100; | |
seq = fai_fetch(fa, loci, &templen); | |
fprintf(tsv_fp, "%s:%s\t%s", chrom, start, seq); | |
free(seq); | |
}else{ | |
fprintf(tsv_fp, "%s:%s\tNA", chrom, start); | |
} | |
int32_t target_pos = atoi(start) -1; //input position are 1 based | |
//load reads in target loci | |
hts_itr_t *samitr = sam_itr_querys(fp_idx, bamHdr, loci); | |
//Keep track of total reads and nt counts per loci | |
int32_t tot_reads = 0; | |
float nt[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; | |
//fprintf(stderr, "\rPercent completed: %d [%d of %d]", 100 * vars_gt/nloci, vars_gt, nloci); | |
//fflush(stdout); | |
printProgress(vars_gt/(double)nloci, vars_gt, nloci); | |
vars_gt = vars_gt + 1; | |
//For every read in the BAM file of target region | |
while(sam_itr_next(fp_in, samitr, aln) > 0){ | |
int32_t pos = aln->core.pos ; //left most position of alignment in zero based coordianate (0-based) | |
uint32_t len = aln->core.l_qseq; //length of the read. | |
uint32_t* cig = bam_get_cigar(aln); | |
uint8_t *qs = bam_get_seq(aln); //quality string | |
//MAPQ and FLAG filter | |
if(aln->core.qual <= q){ | |
continue; | |
} | |
if(aln->core.flag >= F){ | |
continue; | |
} | |
tot_reads = tot_reads +1; | |
//char ins_seq[100]; char del_seq[100]; | |
//get nucleotide id and converts them into IUPAC id. | |
char *qseq = (char *)malloc(len); | |
int i = 0; | |
for(i=0; i< len ; i++){ | |
qseq[i] = seq_nt16_str[bam_seqi(qs,i)]; | |
} | |
//target position on the read | |
int32_t pos_onread = 0; | |
//For every CIGAR string | |
int k = 0; | |
for(k=0;k< aln->core.n_cigar ;++k){ | |
int cop =cig[k] & BAM_CIGAR_MASK; // CIGAR string | |
int cl = cig[k] >> BAM_CIGAR_SHIFT; // CIGAR length | |
if(BAM_CIGAR_STR[cop] == 'M'){ | |
pos_onread = pos_onread + cl; | |
pos = pos + cl; | |
}else if(BAM_CIGAR_STR[cop] == 'S'){ | |
pos_onread = pos_onread + cl; | |
}else if(BAM_CIGAR_STR[cop] == 'I'){ | |
pos_onread = pos_onread + cl; | |
}else if(BAM_CIGAR_STR[cop] == 'D'){ | |
pos = pos + cl; | |
} | |
if(pos > target_pos){ | |
if(BAM_CIGAR_STR[cop] == 'M'){ | |
pos_onread = pos_onread - (pos - target_pos); | |
if(qseq[pos_onread] == 'A'){ | |
nt[0] = nt[0] + 1; | |
}else if(qseq[pos_onread] == 'T'){ | |
nt[1] = nt[1] + 1; | |
}else if(qseq[pos_onread] == 'G'){ | |
nt[2] = nt[2] + 1; | |
}else if(qseq[pos_onread] == 'C'){ | |
nt[3] = nt[3] + 1; | |
} | |
break; | |
} | |
}else if(pos == target_pos){ | |
if(BAM_CIGAR_STR[cop] == 'I'){ | |
nt[4] = nt[4] + 1; | |
// insertion sequence | |
// for(int i = 0; i < cl; i++){ | |
// //strcat(ins_seq, &qseq[pos_onread+i]); | |
// } | |
break; | |
}else if(BAM_CIGAR_STR[cop] == 'D'){ | |
nt[5] = nt[5] + 1; | |
// deletion sequence | |
// for(int i = 0; i < cl; i++){ | |
// strcat(del_seq, qseq[pos_onread+i]); | |
// } | |
break; | |
} | |
} | |
} | |
free(qseq); | |
} | |
hts_itr_destroy(samitr); | |
fprintf(tsv_fp, "\t%.f\t%.f\t%.f\t%.f\t%.f\t%.f\n", nt[0], nt[1], nt[2], nt[3], nt[4], nt[5]); | |
//write nt counts as table row | |
} | |
bam_destroy1(aln); | |
bam_hdr_destroy(bamHdr); | |
fai_destroy(fa); | |
sam_close(fp_in); | |
fclose(bed_fp); | |
fclose(tsv_fp); | |
free(bn); | |
free(bnn); | |
fprintf(stderr, "\n\nDone!\n"); | |
return 0; | |
} | |
void is_bam(char *fname){ | |
int l = strlen(fname); | |
if (l>=4 && strcasecmp(fname+l-4, ".bam") != 0){ | |
usage(); | |
fprintf(stderr, "%s is not a BAM file!\n", fname); | |
exit(0); | |
}else if(l<4){ | |
fprintf(stderr, "%s is not a BAM file!\n", fname); | |
usage(); | |
exit(0); | |
} | |
} | |
//from: https://stackoverflow.com/questions/3288006/are-there-any-c-apis-to-extract-the-base-file-name-from-its-full-path-in-linux | |
char *basename(char const *path){ | |
char *s = strrchr(path, '/'); | |
if (!s){ | |
return strdup(path); | |
}else{ | |
return strdup(s + 1); | |
} | |
} | |
//from: https://stackoverflow.com/questions/2736753/how-to-remove-extension-from-file-name | |
char *removeExt(char* myStr) { | |
char *retStr; | |
char *lastExt; | |
if (myStr == NULL) return NULL; | |
if ((retStr = malloc (strlen (myStr) + 1)) == NULL) return NULL; | |
strcpy (retStr, myStr); | |
lastExt = strrchr (retStr, '.'); | |
if (lastExt != NULL) | |
*lastExt = '\0'; | |
return retStr; | |
} | |
void usage(){ | |
fprintf(stderr, "-------------------------------------------------------------------------\n"); | |
fprintf(stderr, "ntcounts: A tool to extract nucleotide counts\n"); | |
fprintf(stderr, " of genomic loci from the BAM file. Version: %s\n", VERSION); | |
fprintf(stderr, "USAGE:\n"); | |
fprintf(stderr, " ntcounts [OPTIONS] <loci> <bam>\n"); | |
fprintf(stderr, " e.g; ntcounts target_loci.tsv Tumor.bam\n"); | |
fprintf(stderr, "OPTIONS:\n"); | |
fprintf(stderr, " -f Indexed fasta file. If provided, extracts and adds reference base to the ouput tsv\n"); | |
fprintf(stderr, " -q Min. mapping quality threshold. Default 10 [Read filter]\n"); | |
fprintf(stderr, " -F Exclude reads with FLAGS >= INT. Default 3852 [Read filter]\n"); | |
fprintf(stderr, " -o Output file basename. Default parses from basename of BAM file\n"); | |
fprintf(stderr, "ARGS:\n"); | |
fprintf(stderr, " <loci> variant file (chr\\tpos)\n"); | |
fprintf(stderr, " <bam> Indexed BAM file\n"); | |
fprintf(stderr, "OUPUT:\n"); | |
fprintf(stderr, " <output>.tsv TSV file with nucletide counts for all variants\n"); | |
fprintf(stderr, "-------------------------------------------------------------------------\n"); | |
} | |
int countlines(char *filename){ | |
// count the number of lines in the file called filename | |
FILE *fp = fopen(filename,"r"); | |
int ch=0; | |
int lines=0; | |
if (fp == NULL){ | |
return 0; | |
} | |
while(!feof(fp)){ | |
ch = fgetc(fp); | |
if(ch == '\n'){ | |
lines++; | |
} | |
} | |
fclose(fp); | |
return lines; | |
} | |
// Progress bar modified from: https://stackoverflow.com/a/36315819 | |
void printProgress(double percentage, int done, int tot) { | |
int val = (int) (percentage * 100); | |
int lpad = (int) (percentage * PBWIDTH); | |
int rpad = PBWIDTH - lpad; | |
fprintf(stderr, "\r%3d%% [%.*s%*s] %d/%d", val, lpad, PBSTR, rpad, "", done, tot); | |
fflush(stdout); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment