Skip to content

Commit c6cbcc7

Browse files
committed
Merge branch 'bt-2271-query-expr' into develop
2 parents 9d31418 + 6be9e81 commit c6cbcc7

File tree

9 files changed

+145
-5
lines changed

9 files changed

+145
-5
lines changed

convert.c

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ THE SOFTWARE. */
7979
#define T_VKX 31 // VARIANTKEY HEX
8080
#define T_PBINOM 32
8181
#define T_NPASS 33
82+
#define T_FILTER_EXPR 34 // print the results of -i/-e functions via query
8283

8384
typedef struct _fmt_t
8485
{
@@ -123,6 +124,16 @@ typedef struct
123124
}
124125
bcsq_t;
125126

127+
typedef struct
128+
{
129+
filter_t *filter;
130+
int nval;
131+
double *val;
132+
}
133+
filter_expr_t;
134+
135+
static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type);
136+
126137
static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
127138
static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
128139
static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); }
@@ -1157,6 +1168,50 @@ static void destroy_npass(void *usr)
11571168
{
11581169
filter_destroy((filter_t*)usr);
11591170
}
1171+
static void process_filter_expr(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
1172+
{
1173+
filter_expr_t *dat = (filter_expr_t*) fmt->usr;
1174+
1175+
int i, nval, nval1;
1176+
const double *val;
1177+
if ( fmt->is_gt_field )
1178+
{
1179+
if ( !fmt->ready )
1180+
{
1181+
filter_test(dat->filter,line,NULL);
1182+
val = filter_get_doubles(dat->filter,&nval,&nval1);
1183+
if ( fmt->is_gt_field )
1184+
{
1185+
if ( !dat->nval )
1186+
{
1187+
dat->nval = nval;
1188+
dat->val = malloc(nval*sizeof(double));
1189+
if ( !dat->val ) error("Error: failed to allocate %zu bytes\n",nval*sizeof(double));
1190+
}
1191+
assert( dat->nval==nval );
1192+
for (i=0; i<nval; i++) dat->val[i] = val[i];
1193+
}
1194+
fmt->ready = 1;
1195+
}
1196+
val = dat->val;
1197+
nval = dat->nval;
1198+
}
1199+
else
1200+
{
1201+
filter_test(dat->filter,line,NULL);
1202+
val = filter_get_doubles(dat->filter,&nval,&nval1);
1203+
}
1204+
if ( isample<0 ) isample = 0;
1205+
if ( isample>=nval ) isample = 0;
1206+
kputd(val[isample], str);
1207+
}
1208+
static void destroy_filter_expr(void *usr)
1209+
{
1210+
filter_expr_t *dat = (filter_expr_t*) usr;
1211+
filter_destroy(dat->filter);
1212+
free(dat->val);
1213+
free(dat);
1214+
}
11601215

11611216
static void process_pbinom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
11621217
{
@@ -1249,6 +1304,48 @@ static void _used_tags_add(convert_t *convert, int type, char *key)
12491304
else if ( !strcmp("MASK",key) ) { function(__VA_ARGS__, T_MASK); } \
12501305
else if ( !strcmp("LINE",key) ) { function(__VA_ARGS__, T_LINE); }
12511306

1307+
// This invokes the functionality of -i/-e expressions
1308+
static char *set_filter_expr(convert_t *convert, char *key, int is_gtf)
1309+
{
1310+
kstring_t str = {0,0,0};
1311+
char *ptr = key;
1312+
while ( *ptr && *ptr!=')' ) ptr++;
1313+
if ( !*ptr ) error("Could not parse format string: %s\n",convert->format_str);
1314+
kputsn(key, ptr-key+1, &str);
1315+
register_tag(convert, str.s, is_gtf, T_FILTER_EXPR);
1316+
free(str.s);
1317+
return key+str.l;
1318+
}
1319+
1320+
// These are the -i/-e functions made to be printed via `query -f`
1321+
#define _SET_FILTER_EXPR(convert,function,key,ptr,is_gtf) \
1322+
if ( !strncasecmp(key,"MAX(",4) ) { ptr = function(convert,key,is_gtf); } \
1323+
else if ( !strncasecmp(key,"MIN(",4) ) { ptr = function(convert,key,is_gtf); } \
1324+
else if ( !strncasecmp(key,"MEAN(",5) ) { ptr = function(convert,key,is_gtf); } \
1325+
else if ( !strncasecmp(key,"MEDIAN(",7) ) { ptr = function(convert,key,is_gtf); } \
1326+
else if ( !strncasecmp(key,"AVG(",4) ) { ptr = function(convert,key,is_gtf); } \
1327+
else if ( !strncasecmp(key,"SUM(",4) ) { ptr = function(convert,key,is_gtf); } \
1328+
else if ( !strncasecmp(key,"ABS(",4) ) { ptr = function(convert,key,is_gtf); } \
1329+
else if ( !strncasecmp(key,"COUNT(",6) ) { ptr = function(convert,key,is_gtf); } \
1330+
else if ( !strncasecmp(key,"STDEV(",6) ) { ptr = function(convert,key,is_gtf); } \
1331+
else if ( !strncasecmp(key,"STRLEN(",7) ) { ptr = function(convert,key,is_gtf); } \
1332+
else if ( !strncasecmp(key,"BINOM(",6) ) { ptr = function(convert,key,is_gtf); } \
1333+
else if ( !strncasecmp(key,"PHRED(",6) ) { ptr = function(convert,key,is_gtf); } \
1334+
else if ( !strncasecmp(key,"SMPL_MAX(",9) ) { ptr = function(convert,key,is_gtf); } \
1335+
else if ( !strncasecmp(key,"SMPL_MIN(",9) ) { ptr = function(convert,key,is_gtf); } \
1336+
else if ( !strncasecmp(key,"SMPL_MEAN(",10) ) { ptr = function(convert,key,is_gtf); } \
1337+
else if ( !strncasecmp(key,"SMPL_MEDIAN(",12) ) { ptr = function(convert,key,is_gtf); } \
1338+
else if ( !strncasecmp(key,"SMPL_AVG(",9) ) { ptr = function(convert,key,is_gtf); } \
1339+
else if ( !strncasecmp(key,"SMPL_STDEV(",11) ) { ptr = function(convert,key,is_gtf); } \
1340+
else if ( !strncasecmp(key,"SMPL_SUM(",9) ) { ptr = function(convert,key,is_gtf); } \
1341+
else if ( !strncasecmp(key,"sMAX(",5) ) { ptr = function(convert,key,is_gtf); } \
1342+
else if ( !strncasecmp(key,"sMIN(",5) ) { ptr = function(convert,key,is_gtf); } \
1343+
else if ( !strncasecmp(key,"sMEAN(",6) ) { ptr = function(convert,key,is_gtf); } \
1344+
else if ( !strncasecmp(key,"sMEDIAN(",8) ) { ptr = function(convert,key,is_gtf); } \
1345+
else if ( !strncasecmp(key,"sAVG(",5) ) { ptr = function(convert,key,is_gtf); } \
1346+
else if ( !strncasecmp(key,"sSTDEV(",7) ) { ptr = function(convert,key,is_gtf); } \
1347+
else if ( !strncasecmp(key,"sSUM(",5) ) { ptr = function(convert,key,is_gtf); }
1348+
12521349
static void set_type(fmt_t *fmt, int type) { fmt->type = type; }
12531350
static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
12541351
{
@@ -1273,8 +1370,8 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
12731370
if ( fmt->type==T_FORMAT && !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,id) )
12741371
{
12751372
_SET_NON_FORMAT_TAGS(set_type,key,fmt)
1276-
else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
1277-
else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
1373+
else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
1374+
else if ( !strcmp("_CHROM_POS_ID",key) ) { fmt->type = T_CHROM_POS_ID; }
12781375
else if ( !strcmp("RSX",key) ) { fmt->type = T_RSX; }
12791376
else if ( !strcmp("VKX",key) ) { fmt->type = T_VKX; }
12801377
else if ( id>=0 && bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,id) )
@@ -1295,6 +1392,14 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
12951392
convert->max_unpack |= filter_max_unpack(flt);
12961393
fmt->usr = (void*) flt;
12971394
}
1395+
else if ( fmt->type==T_FILTER_EXPR )
1396+
{
1397+
filter_t *filter = filter_init(convert->header,key);
1398+
convert->max_unpack |= filter_max_unpack(filter);
1399+
filter_expr_t *dat = calloc(1,sizeof(filter_expr_t));
1400+
fmt->usr = dat;
1401+
dat->filter = filter;
1402+
}
12981403
}
12991404

13001405
switch (fmt->type)
@@ -1332,6 +1437,7 @@ static fmt_t *register_tag(convert_t *convert, char *key, int is_gtf, int type)
13321437
case T_VKX: fmt->handler = &process_variantkey_hex; break;
13331438
case T_PBINOM: fmt->handler = &process_pbinom; convert->max_unpack |= BCF_UN_FMT; break;
13341439
case T_NPASS: fmt->handler = &process_npass; fmt->destroy = &destroy_npass; break;
1440+
case T_FILTER_EXPR: fmt->handler = &process_filter_expr; fmt->destroy = &destroy_filter_expr; break;
13351441
default: error("TODO: handler for type %d\n", fmt->type);
13361442
}
13371443
if ( key && fmt->type==T_INFO )
@@ -1367,7 +1473,8 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
13671473
kputsn(p, q-p, &str);
13681474
if ( is_gtf )
13691475
{
1370-
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
1476+
_SET_FILTER_EXPR(convert,set_filter_expr,p,q,1)
1477+
else if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
13711478
else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
13721479
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
13731480
else if ( !strcmp(str.s, "TBCSQ") )
@@ -1422,6 +1529,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
14221529
else
14231530
{
14241531
_SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
1532+
else _SET_FILTER_EXPR(convert,set_filter_expr,p,q,0)
14251533
else if ( !strcmp(str.s, "ALT") )
14261534
{
14271535
fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);

doc/bcftools.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ Add or remove annotations.
405405
^INFO/TAG .. transfer all INFO annotations except "TAG"
406406

407407
TAG .. add or overwrite existing target value if source is not "." and skip otherwise
408-
+TAG .. add or overwrite existing target value only it is "."
408+
+TAG .. add or overwrite existing target value only if it is "."
409409
.TAG .. add or overwrite existing target value even if source is "."
410410
.+TAG .. add new but never overwrite existing tag, regardless of its value; can transfer "." if target does not exist
411411
-TAG .. overwrite existing value, never add new if target does not exist
@@ -3199,6 +3199,7 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format.
31993199
%FIRST_ALT Alias for %ALT{0}
32003200
%FORMAT Prints all FORMAT fields or a subset of samples with -s or -S
32013201
%GT Genotype (e.g. 0/1)
3202+
%FUNCTION Functions supported by the -i/-e filtering expressions (e.g. "[ %sSUM(FMT/AD)] %SUM(FMT/AD) %SUM(INFO/AD)")
32023203
%INFO Prints the whole INFO column
32033204
%INFO/TAG Any tag in the INFO column
32043205
%IUPACGT Genotype translated to IUPAC ambiguity codes (e.g. M instead of C/A)

filter.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ static int filters_next_token(char **str, int *len)
219219
if ( !strncasecmp(tmp,"STDEV(",6) ) { (*str) += 5; return TOK_STDEV; }
220220
if ( !strncasecmp(tmp,"SUM(",4) ) { (*str) += 3; return TOK_SUM; }
221221
if ( !strncasecmp(tmp,"ABS(",4) ) { (*str) += 3; return TOK_ABS; }
222-
if ( !strncasecmp(tmp,"COUNT(",4) ) { (*str) += 5; return TOK_CNT; }
222+
if ( !strncasecmp(tmp,"COUNT(",6) ) { (*str) += 5; return TOK_CNT; }
223223
if ( !strncasecmp(tmp,"STRLEN(",7) ) { (*str) += 6; return TOK_LEN; }
224224
if ( !strncasecmp(tmp,"BINOM(",6) ) { (*str) += 5; return -TOK_BINOM; }
225225
if ( !strncasecmp(tmp,"PHRED(",6) ) { (*str) += 5; return TOK_PHRED; }

test/query.func.1.1.out

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
1:150 0,0,0 0
2+
1:153 0,0,21 21
3+
1:154 30,13,0 43
4+
1:155 20,0,10 30

test/query.func.1.2.out

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
1:150 . . 0
2+
1:153 .,.,. 0,0,210 210
3+
1:154 30,13,0 300,13,0 356
4+
1:155 20,0,10 200,0,10 240

test/query.func.1.3.out

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
1:150 . . 0 0
2+
1:153 .,.,. 0,0,210 210 210
3+
1:154 30,13,0 300,13,0 356 356
4+
1:155 20,0,10 200,0,10 240 240

test/query.func.1.4.out

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
1:150 . . nan nan
2+
1:153 .,.,. 0,0,210 nan 210
3+
1:154 30,13,0 300,13,0 43 313
4+
1:155 20,0,10 200,0,10 30 210

test/query.func.1.vcf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
##fileformat=VCFv4.3
2+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
3+
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Number of reads supporting the allele">
4+
##INFO=<ID=AD,Number=R,Type=Integer,Description="Number of reads supporting the allele">
5+
##contig=<ID=1,assembly=b37,length=249250621>
6+
##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta
7+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B
8+
1 150 . C T . . AD=0,0,0 AD . .
9+
1 153 . C T . . AD=0,0,21 AD .,.,. 0,0,210
10+
1 154 . C T . . AD=30,13,0 AD 30,13,0 300,13,0
11+
1 155 . C T . . AD=20,0,10 AD 20,0,10 200,0,10

test/test.pl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@
110110
run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.5.a','merge.gvcf.5.b'],out=>'merge.gvcf.5.1.out',args=>'--gvcf - --merge none');
111111
run_test(\&test_vcf_merge,$opts,in=>['merge.gvcf.11.a','merge.gvcf.11.b','merge.gvcf.11.c'],out=>'merge.gvcf.11.1.out',args=>'--gvcf -');
112112
# run_test(\&test_vcf_merge_big,$opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>''); # commented out for speed
113+
run_test(\&test_vcf_query,$opts,in=>'query.func.1',out=>'query.func.1.1.out',args=>q[-f '%CHROM:%POS\\t%INFO/AD\\t%SUM(INFO/AD)']);
114+
run_test(\&test_vcf_query,$opts,in=>'query.func.1',out=>'query.func.1.2.out',args=>q[-f '%CHROM:%POS\\t[%AD ]\\t%SUM(FORMAT/AD)']);
115+
run_test(\&test_vcf_query,$opts,in=>'query.func.1',out=>'query.func.1.3.out',args=>q[-f '%CHROM:%POS\\t[%AD ]\\t[ %SUM(FORMAT/AD)]']);
116+
run_test(\&test_vcf_query,$opts,in=>'query.func.1',out=>'query.func.1.4.out',args=>q[-f '%CHROM:%POS\\t[%AD ]\\t[ %sSUM(FORMAT/AD)]']);
113117
run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided,_conflicting_interpretations"']);
114118
run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided" || CLNREVSTAT="_conflicting_interpretations"']);
115119
run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.2.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided" && CLNREVSTAT="_conflicting_interpretations"']);

0 commit comments

Comments
 (0)