Skip to content

Instantly share code, notes, and snippets.

@dmckean
Created May 9, 2018 21:58
Show Gist options
  • Save dmckean/9b641956fde4bd710e0be559d701f6fd to your computer and use it in GitHub Desktop.
Save dmckean/9b641956fde4bd710e0be559d701f6fd to your computer and use it in GitHub Desktop.
bcftools plugin for binning and tabulating integer FORMAT data
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <getopt.h>
#include <unistd.h>
#include <inttypes.h>
#include <htslib/vcf.h>
#include "bcftools.h"
#include "version.c"
int nsmpl, rMin, rStep, rMax, missing, minima, maxima;
size_t nbins;
char * fmt_str;
bcf_hdr_t *hdr;
const char *about(void)
{
return "Bins and tabulates FORMAT data for each VCF record.\n";
}
const char *usage(void) {
return
"\n"
"About: Bin and tabulate integer FORMAT fields for each VCF record..\n"
"Usage: bcftools +tabulate <in.bcf/.vcf.gz> [General Options] -- [Plugin Options]\n"
"\n"
"Options:\n"
" run \"bcftools plugin\" for a list of common options\n"
"\n"
"Plugin options:\n"
" -f --format FMT_ID ID for format field (required)\n"
"\n"
" -b, --begin BEGIN_INT Lowest interval for binned output [0]\n"
" -s, --step STEP_INT Bin width for binned output [1]\n"
" -e --end END_INT Max interval for binned output [100]\n"
"\n"
" --no-missing Ignore missing values (e.g. non numeric values, like '.')\n"
" --no-minima Ignores values below `BEGIN_INT`\n"
" --no-maxima Ignores values greater than or equal to `END_INT`\n"
"\n"
"Example:\n"
" bcftools +tabulate in.bcf -- -f DP\n"
"\n";
}
int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr)
{
hdr = in_hdr;
nsmpl = bcf_hdr_nsamples(hdr);
rMin = 0;
rStep = 1;
rMax = 100;
missing=1;
minima=1;
maxima=1;
int c;
static struct option loptions[] =
{
{"format", required_argument, NULL, 'f'},
{"begin", optional_argument, NULL, 'b'},
{"step", optional_argument, NULL, 's'},
{"end", optional_argument, NULL, 'e'},
{"no-missing", no_argument, NULL, 'm'},
{"no-minima", no_argument, NULL, 'n'},
{"no-maxima", no_argument, NULL, 'p'},
{NULL, no_argument, NULL, 0}
};
while ((c = getopt_long(argc, argv, "f:b:s:e:amnp", loptions, NULL)) >= 0) {
switch (c) {
case 'f':
fmt_str = optarg;
case 'b':
rMin = (int) strtol(optarg, NULL, 10);
break;
case 's':
rStep = (int) strtol(optarg, NULL, 10);
break;
case 'e':
rMax = (int) strtol(optarg, NULL, 10);
break;
case 'm':
missing=0;
break;
case 'n':
minima=0;
break;
case 'p':
maxima=0;
break;
default:
fprintf(stderr, "%s\n", usage());
exit(EXIT_FAILURE);
}
}
if( fmt_str == NULL ) {
fprintf(stderr, "[error]: missing --format FMT_STR\n");
exit(EXIT_FAILURE);
}
if( rMax < rMin ) {
fprintf(stderr, "[error]: END_INT: %d must be greater than or equal to BEGIN_INT: %d\n", rMax, rMin);
exit(EXIT_FAILURE);
}
if( rStep < 1 ) {
fprintf(stderr, "[error]: STEP_INT: %d must be greater than 0.\n", rStep);
exit(EXIT_FAILURE);
}
int fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, bcf_hdr_id2int(hdr, BCF_DT_ID, (const char*) fmt_str));
if ( fmt_type != BCF_HT_INT ) {
char fmt_type_str[10] = "";
if(fmt_type == BCF_HT_FLAG) {
strcat(fmt_type_str, "flag");
} else if ( fmt_type == BCF_HT_REAL ) {
strcat(fmt_type_str, "float");
} else if(fmt_type == BCF_HT_STR) {
strcat(fmt_type_str, "string");
} else {
strcat(fmt_type_str, "unknown");
}
fprintf(stderr, "[error]: Invalid type '%s' for FORMAT ID=%s\n", fmt_type_str, fmt_str);
exit(EXIT_FAILURE);
}
int r = rMax - rMin;
nbins = (unsigned int) r / rStep + ((r % rStep) > 0);
printf("#CHROM\tPOS\tID\tREF\tALT");
if(missing == 1) printf("\t%s_missing", fmt_str);
if(minima == 1) printf("\t(,%d)", rMin);
if(rStep == 1) {
for(int i = rMin; i < rMax; i++) printf("\t%d", i);
} else {
for(int i = rMin; i < rMax; i += rStep) printf("\t[%d,%d)", i, ((i + rStep < rMax) ? i + rStep : rMax));
}
if(maxima == 1) printf("\t[%d,)", rMax);
printf("\n");
return 1;
}
void fprintf_rec(bcf1_t *rec, FILE *stream) {
fprintf(stream, "%s\t%d\t%s\t%s\t%s", bcf_seqname(hdr, rec), rec->pos + 1, rec->d.id, rec->d.allele[0], rec->d.allele[1]);
for(int i = 2; i < rec->n_allele; i++) fprintf(stream, ",%s", rec->d.allele[i]);
}
void process_int(bcf1_t *rec, int *val_totals) {
int nval, arr_len = 0;
int32_t *val_arr = NULL;
nval = bcf_get_format_int32(hdr, rec, fmt_str, (void**)(&val_arr), &arr_len);
if( nval != nsmpl ) {
if (nval < nsmpl) {fprintf(stderr, "Values missing on "); }
else {fprintf(stderr, "Extra values on ");}
fprintf_rec(rec, stderr);
fprintf(stderr, "\n %d samples found, %d values for FMT ID=%s found\n", nsmpl, nval, fmt_str);
exit(EXIT_FAILURE);
}
int fmt_val;
for(int i = 0; i < nsmpl; i++) {
fmt_val = val_arr[i];
if (fmt_val == bcf_int32_missing) {
if (missing == 1) {
fmt_val = 0;
} else {
continue;
}
} else {
if(fmt_val >= rMax) {
if (maxima) {
fmt_val = (int) nbins + missing + minima;
} else {
continue;
}
}
else if (fmt_val < rMin) {
if (minima) {
fmt_val = 0;
} else {
continue;
}
} else {
fmt_val = (fmt_val - rMin) / rStep + missing + minima;
}
}
val_totals[fmt_val]++;
}
free(val_arr);
}
// Called for each VCF record, return NULL to suppress the output
bcf1_t *process(bcf1_t *rec)
{
bcf_unpack(rec, BCF_UN_FMT);
int *val_totals = calloc(missing + minima + nbins + maxima, sizeof(int));
process_int(rec, val_totals);
fprintf_rec(rec, stdout);
for(int i = 0; i < (missing + minima + nbins + maxima); i++) printf("\t%d", val_totals[i]);
printf("\n");
free(val_totals);
return NULL;
}
void destroy(void) {
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment