diff --git a/CHANGES.md b/CHANGES.md index 5f77972..8f82483 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +### 1.13.0 +* bam_stats - Unit tests for C code +* bam_stats - Fix to median insert size calculation + ### 1.11.0 * bam_stats - new rna switch to give more appropriate insert size stats * bam_stats - more robust handling of optional RG header entries diff --git a/MANIFEST b/MANIFEST index 55b51a1..531bf09 100644 --- a/MANIFEST +++ b/MANIFEST @@ -10,6 +10,15 @@ bin/xml_to_bas.pl c/bam_access.c c/bam_access.h c/bam_stats.c +c/bam_stats_calcs.c +c/bam_stats_calcs.h +c/bam_stats_output.c +c/bam_stats_output.h +c/c_tests/bam_access_tests.c +c/c_tests/bam_stats_calcs_tests.c +c/c_tests/bam_stats_output_tests.c +c/c_tests/minunit.h +c/c_tests/runtests.sh c/dbg.h c/khash.h CHANGES.md @@ -68,6 +77,7 @@ t/data/paired.bam t/data/reconcile_bas.bam t/data/Stats.bam t/data/Stats.bam.bas +t/data/Stats.c.bam.bas t/data/test.bam.bas t/data/unpaired.bam t/pcap.t diff --git a/c/Makefile b/c/Makefile index c0d9c94..5b6474f 100644 --- a/c/Makefile +++ b/c/Makefile @@ -30,9 +30,9 @@ LFLAGS?= -L$(HTSTMP) LIBS =-lhts -lpthread -lz -lm # define the C source files -SRCS = ./bam_access.c ./bam_stats.c +SRCS = ./bam_access.c ./bam_stats_output.c ./bam_stats_calcs.c #Define test sources -TEST_SRC=$(wildcard ./tests/*_tests.c) +TEST_SRC=$(wildcard ./c_tests/*_tests.c) TESTS=$(patsubst %.c,%,$(TEST_SRC)) # define the C object files @@ -60,17 +60,17 @@ BAM_STATS_TARGET=../bin/bam_stats .NOTPARALLEL: test -all: clean make_htslib_tmp $(BAM_STATS_TARGET) remove_htslib_tmp +all: clean make_htslib_tmp $(BAM_STATS_TARGET) test remove_htslib_tmp @echo bam_stats compiled. $(BAM_STATS_TARGET): $(OBJS) - $(CC) $(CFLAGS) $(INCLUDES) -o $(BAM_STATS_TARGET) $(OBJS) $(LFLAGS) $(LIBS) + $(CC) $(CFLAGS) $(INCLUDES) -o $(BAM_STATS_TARGET) $(OBJS) $(LFLAGS) $(LIBS) ./bam_stats.c #Unit Tests test: $(BAM_STATS_TARGET) test: CFLAGS += $(INCLUDES) $(OBJS) $(LFLAGS) $(LIBS) test: $(TESTS) - sh ./tests/runtests.sh + sh ./c_tests/runtests.sh #Unit tests with coverage coverage: CFLAGS += --coverage diff --git a/c/bam_access.c b/c/bam_access.c index c306dc9..67efa17 100644 --- a/c/bam_access.c +++ b/c/bam_access.c @@ -1,6 +1,6 @@ /* LICENCE * PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project -* Copyright (C) 2014 ICGC PanCancer Project +* Copyright (C) 2014-2016 ICGC PanCancer Project * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -21,6 +21,7 @@ #include #include #include "bam_access.h" +#include "bam_stats_calcs.h" int get_rg_index_from_rg_store(rg_info_t **grps, char *rg, int grps_size){ int i=0; @@ -35,9 +36,9 @@ void parse_rg_line(char *tmp_line, rg_info_t *group) { char *tag = strtok(tmp_line,"\t"); assert(strcmp(tag,"@RG")==0); group->id = strdup("\0"); - group->sample = strdup("\0"); - group->platform = strdup("\0"); - group->platform_unit = strdup("\0"); + group->sample = strdup("\0"); + group->platform = strdup("\0"); + group->platform_unit = strdup("\0"); group->lib = strdup("\0"); tag = strtok(NULL,"\t"); while(tag != NULL){ @@ -55,7 +56,7 @@ void parse_rg_line(char *tmp_line, rg_info_t *group) { return; } -rg_info_t **parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_stats){ +rg_info_t **bam_access_parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_stats){ assert(head != NULL); char *line = NULL; rg_info_t **groups; @@ -150,7 +151,7 @@ rg_info_t **parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_sta return NULL; } -int process_reads(htsFile *input, bam_hdr_t *head, rg_info_t **grps, int grps_size, stats_rd_t ****grp_stats, int rna){ +int bam_access_process_reads(htsFile *input, bam_hdr_t *head, rg_info_t **grps, int grps_size, stats_rd_t ****grp_stats, int rna){ assert(input != NULL); assert(head != NULL); assert(grps != NULL); @@ -206,12 +207,12 @@ int process_reads(htsFile *input, bam_hdr_t *head, rg_info_t **grps, int grps_si uint32_t nm_val = bam_aux2i(nm); if(nm_val>0){ (*grp_stats)[rg_index][read]->divergent += nm_val; - (*grp_stats)[rg_index][read]->mapped_bases += get_mapped_base_count_from_cigar(b); + (*grp_stats)[rg_index][read]->mapped_bases += bam_access_get_mapped_base_count_from_cigar(b); }else{ (*grp_stats)[rg_index][read]->mapped_bases += (bam_endpos(b) - b->core.pos) + 1; } }else{ - (*grp_stats)[rg_index][read]->mapped_bases += get_mapped_base_count_from_cigar(b); + (*grp_stats)[rg_index][read]->mapped_bases += bam_access_get_mapped_base_count_from_cigar(b); } // Insert size can only be calculated based on reads that are on same chr @@ -238,7 +239,7 @@ int process_reads(htsFile *input, bam_hdr_t *head, rg_info_t **grps, int grps_si return -1; } -uint64_t get_mapped_base_count_from_cigar(bam1_t *b){ +uint64_t bam_access_get_mapped_base_count_from_cigar(bam1_t *b){ #define _cop(c) ((c)&BAM_CIGAR_MASK) #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) assert(b != NULL); diff --git a/c/bam_access.h b/c/bam_access.h index 3dbccc9..471fd65 100644 --- a/c/bam_access.h +++ b/c/bam_access.h @@ -1,6 +1,6 @@ /* LICENCE * PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project -* Copyright (C) 2014 ICGC PanCancer Project +* Copyright (C) 2014-2016 ICGC PanCancer Project * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -25,8 +25,8 @@ #include #include "htslib/sam.h" #include "dbg.h" - #include "khash.h" + KHASH_MAP_INIT_INT(ins,uint64_t) //KHASH_INIT2(ins,, khint32_t, uint64_t, 1, kh_int_hash_func, kh_int_hash_equal) @@ -52,10 +52,10 @@ typedef struct{ char *sample; } rg_info_t; -rg_info_t **parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_stats); +rg_info_t **bam_access_parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_stats); -int process_reads(htsFile *input, bam_hdr_t *head, rg_info_t **grps, int grps_size, stats_rd_t ****grp_stats, int rna); +int bam_access_process_reads(htsFile *input, bam_hdr_t *head, rg_info_t **grps, int grps_size, stats_rd_t ****grp_stats, int rna); -uint64_t get_mapped_base_count_from_cigar(bam1_t *b); +uint64_t bam_access_get_mapped_base_count_from_cigar(bam1_t *b); #endif diff --git a/c/bam_stats.c b/c/bam_stats.c index 334d36c..66958b7 100644 --- a/c/bam_stats.c +++ b/c/bam_stats.c @@ -1,6 +1,6 @@ -/* LICENCE +/*########LICENCE######### * PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project -* Copyright (C) 2014 ICGC PanCancer Project +* Copyright (C) 2014-2016 ICGC PanCancer Project * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -15,7 +15,7 @@ * You should have received a copy of the GNU General Public License * along with this program; if not see: * http://www.gnu.org/licenses/gpl-2.0.html -*/ +*#########LICENCE#########*/ #include @@ -24,12 +24,9 @@ #include #include #include -#include -#include #include "dbg.h" -#include -#include - +#include "bam_access.h" +#include "bam_stats_output.h" #include "khash.h" @@ -39,8 +36,6 @@ static char *ref_file = NULL; static int rna = 0; int grps_size = 0; stats_rd_t*** grp_stats; -static char *bas_header = "bam_filename\tsample\tplatform\tplatform_unit\tlibrary\treadgroup\tread_length_r1\tread_length_r2\t#_mapped_bases\t#_mapped_bases_r1\t#_mapped_bases_r2\t#_divergent_bases\t#_divergent_bases_r1\t#_divergent_bases_r2\t#_total_reads\t#_total_reads_r1\t#_total_reads_r2\t#_mapped_reads\t#_mapped_reads_r1\t#_mapped_reads_r2\t#_mapped_reads_properly_paired\t#_gc_bases_r1\t#_gc_bases_r2\tmean_insert_size\tinsert_size_sd\tmedian_insert_size\t#_duplicate_reads\n"; -static char *rg_line_pattern = "%s\t%s\t%s\t%s\t%s\t%s\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%.3f\t%.3f\t%.3f\t%"PRIu64"\n"; int check_exist(char *fname){ FILE *fp; @@ -152,183 +147,15 @@ void options(int argc, char *argv[]){ return; } -int calculate_mean_sd_median_insert_size(khash_t(ins) *inserts,double *mean, double *sd, double *median){ - - uint64_t pp_mean = 0; - uint64_t tt_mean = 0; - uint32_t key; - uint64_t val; - - kh_foreach(inserts,key,val, - { pp_mean += key * val; - tt_mean += val; - }); - - if(tt_mean){//Calculate mean , median, sd - *mean = (double) ((double)pp_mean/(double)tt_mean); - - float midpoint = (float)((float)tt_mean / (float)2); - float midpoint2 = (float)((float)tt_mean / (float)2 + (float)1); - uint32_t insert = 0; - uint32_t prev_insert = 0; - uint64_t running_total = 0; - - kh_foreach(inserts,key,val, - { insert = key; - running_total += val; - if(running_total >= midpoint) break; - prev_insert = key; - }); - - if(tt_mean %2 == 0 && ( running_total - midpoint2 >= insert )){ - //warn "Thinks is even AND split between bins "; - *median = (((double)insert + (double)prev_insert) / (double)2); - }else{ - //warn "Thinks is odd or NOT split between bins"; - *median = (double)(insert); - } - - //We have mean and median so calculate the SD - uint64_t pp_sd = 0; - uint64_t tt_sd = 0; - - kh_foreach(inserts,key,val, - { double diff = (double)(key) - (*mean); - pp_sd += (diff * diff) * (double)val; - tt_sd += val; - }); - - - - if(tt_sd){ - double variance = fabs((double)((double)pp_sd / (double)tt_sd)); - *sd = sqrt(variance); - }else{ - *sd = 0; - } - - kh_destroy(ins, inserts); - - } //End of if we have data to calculate from. - return 0; -} - -int print_results(rg_info_t **grps){ - FILE *out; - if (strcmp(output_file,"-")==0) { - out = stdout; - } else { - out = fopen(output_file,"w"); - } - check(out != NULL,"Error trying to open output file %s for writing.",output_file); - - int chk = fprintf(out,"%s",bas_header); - check(chk==strlen(bas_header),"Error writing bas_header to output file."); - - //Iterate through each RG - int i=0; - for(i=0;icount==0 && grp_stats[i][1]->count==0) continue; // Skip empty read groups stats - uint64_t unmapped_r1 = grp_stats[i][0]->umap; - uint64_t unmapped_r2 = grp_stats[i][1]->umap; - uint64_t unmapped = grp_stats[i][0]->umap + grp_stats[i][1]->umap; - - uint64_t total_reads_r1 = grp_stats[i][0]->count; - uint64_t total_reads_r2 = grp_stats[i][1]->count; - uint64_t total_reads = grp_stats[i][0]->count + grp_stats[i][1]->count; - - uint64_t mapped_reads = total_reads - unmapped; - - uint64_t gc_r1 = grp_stats[i][0]->gc; - uint64_t gc_r2 = grp_stats[i][1]->gc; - - uint64_t mapped_reads_r1 = 0; - uint64_t mapped_reads_r2 = 0; - uint64_t proper_pairs = 0; - uint64_t mapped_bases = 0; - uint64_t mapped_bases_r1 = 0; - uint64_t mapped_bases_r2 = 0; - uint64_t divergent_bases = 0; - uint64_t divergent_bases_r1 = 0; - uint64_t divergent_bases_r2 = 0; - uint64_t dup_reads = 0; - - double mean_insert_size = 0; - double insert_size_sd = 0; - double median_insert_size = 0; - - if(mapped_reads>0){ - //Only need group one as they are pairs - proper_pairs = grp_stats[i][0]->proper; - mapped_reads_r1 = total_reads_r1 - unmapped_r1; - mapped_reads_r2 = total_reads_r2 - unmapped_r2; - - mapped_bases_r1 = grp_stats[i][0]->mapped_bases; - mapped_bases_r2 = grp_stats[i][1]->mapped_bases; - mapped_bases = grp_stats[i][0]->mapped_bases + grp_stats[i][1]->mapped_bases; - - divergent_bases_r1 = grp_stats[i][0]->divergent; - divergent_bases_r2 = grp_stats[i][1]->divergent; - divergent_bases = grp_stats[i][0]->divergent + grp_stats[i][1]->divergent; - - calculate_mean_sd_median_insert_size(grp_stats[i][0]->inserts,&mean_insert_size,&insert_size_sd,&median_insert_size); - dup_reads = grp_stats[i][0]->dups + grp_stats[i][1]->dups; - } - - uint32_t read_length_r1 = grp_stats[i][0]->length; - uint32_t read_length_r2 = grp_stats[i][1]->length; - - char *file = basename(input_file); - - chk = fprintf(out,rg_line_pattern, - file, - grps[i]->sample, - grps[i]->platform, - grps[i]->platform_unit, - grps[i]->lib, - grps[i]->id, - read_length_r1, - read_length_r2, - mapped_bases, - mapped_bases_r1, - mapped_bases_r2, - divergent_bases, - divergent_bases_r1, - divergent_bases_r2, - total_reads, - total_reads_r1, - total_reads_r2, - mapped_reads, - mapped_reads_r1, - mapped_reads_r2, - proper_pairs, - gc_r1, - gc_r2, - mean_insert_size, - insert_size_sd, - median_insert_size, - dup_reads); - - check(chk>0,"Error writing bas line to output file."); - fflush(out); - } - - if (out != stdout) fclose(out); - return 0; - -error: - if(out && out != stdout) fclose(out); - return -1; - -} - int main(int argc, char *argv[]){ options(argc, argv); - htsFile *input; - bam_hdr_t *head; - + htsFile *input = NULL; + bam_hdr_t *head = NULL; + rg_info_t **grps = NULL; //Open bam file as object input = hts_open(input_file,"r"); + check(input != NULL, "Error opening hts file for reading '%s'.",input_file); + //Set reference index file if(ref_file){ hts_set_fai_filename(input, ref_file); @@ -338,14 +165,17 @@ int main(int argc, char *argv[]){ //Read header from bam file head = sam_hdr_read(input); - rg_info_t **grps = parse_header(head, &grps_size, &grp_stats); + check(head != NULL, "Error reading header from opened hts file '%s'.",input_file); + + + grps = bam_access_parse_header(head, &grps_size, &grp_stats); check(grps != NULL, "Error fetching read groups from header."); //Process every read in bam file. - int check = process_reads(input,head,grps, grps_size, &grp_stats, rna); + int check = bam_access_process_reads(input,head,grps, grps_size, &grp_stats, rna); check(check==0,"Error processing reads in bam file."); - int res = print_results(grps); + int res = bam_stats_output_print_results(grps,grps_size,grp_stats,input_file,output_file); check(res==0,"Error writing bam_stats output to file."); bam_hdr_destroy(head); diff --git a/c/bam_stats_calcs.c b/c/bam_stats_calcs.c new file mode 100644 index 0000000..211bb34 --- /dev/null +++ b/c/bam_stats_calcs.c @@ -0,0 +1,111 @@ +/* LICENCE +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014-2016 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*/ + +#include +#include +#include +#include "bam_access.h" + +int compare( const void* a, const void* b) +{ + uint64_t int_a = * ( (uint64_t*) a ); + uint64_t int_b = * ( (uint64_t*) b ); + if ( int_a == int_b ) return 0; + else if ( int_a < int_b ) return -1; + else return 1; +} + +int bam_stats_calcs_calculate_mean_sd_median_insert_size(khash_t(ins) *inserts,double *mean, double *sd, double *median){ + + uint64_t pp_mean = 0; + uint64_t tt_mean = 0; + uint64_t key; + uint64_t val; + + uint64_t *insert_bins; + + insert_bins = malloc(sizeof(uint64_t) * kh_size(inserts)); + + int i=0; + kh_foreach(inserts,key,val, + { pp_mean += key * val; + tt_mean += val; + insert_bins[i] = key; + i++; + }); + + if(tt_mean){//Calculate mean , median, sd + *mean = (double) ((double)pp_mean/(double)tt_mean); + + + //sort the array of insert sizes. + qsort( insert_bins, kh_size(inserts), sizeof(uint64_t), compare ); + + int midpoint2 = (int)(tt_mean / 2); + int midpoint = (midpoint2 + 1); + uint64_t insert = 0; + uint64_t prev_insert = 0; + uint64_t running_total = 0; + uint64_t current_bin_count = 0; + + + int j=0; + for(j=0; j<=kh_size(inserts); j++){ + insert = insert_bins[j]; // The insert size... + khint_t k; + k = kh_get(ins,inserts,insert); + uint64_t val = kh_val(inserts,k); + running_total += val; + current_bin_count = val; + if(running_total >= midpoint) break; + prev_insert = insert_bins[j]; + } + + if(tt_mean %2 == 0 && ( running_total - midpoint2 >= current_bin_count )){ + //warn "Thinks is even AND split between bins "; + *median = (((double)insert + (double)prev_insert) / (double)2); + }else{ + //warn "Thinks is odd or NOT split between bins"; + *median = (double)(insert); + } + + //We have mean and median so calculate the SD + uint64_t pp_sd = 0; + uint64_t tt_sd = 0; + + kh_foreach(inserts,key,val, + { double diff = (double)(key) - (*mean); + pp_sd += (diff * diff) * (double)val; + tt_sd += val; + }); + + + + if(tt_sd){ + double variance = fabs((double)((double)pp_sd / (double)tt_sd)); + *sd = sqrt(variance); + }else{ + *sd = 0; + } + + kh_destroy(ins, inserts); + + } //End of if we have data to calculate from. + return 0; +} \ No newline at end of file diff --git a/c/bam_stats_calcs.h b/c/bam_stats_calcs.h new file mode 100644 index 0000000..96c095d --- /dev/null +++ b/c/bam_stats_calcs.h @@ -0,0 +1,27 @@ +/* LICENCE +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014-2016 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*/ + +#ifndef __bam_stats_calcs_h__ +#define __bam_stats_calcs_h__ + +#include "bam_access.h" + +int bam_stats_calcs_calculate_mean_sd_median_insert_size(khash_t(ins) *inserts,double *mean, double *sd, double *median); + +#endif \ No newline at end of file diff --git a/c/bam_stats_output.c b/c/bam_stats_output.c new file mode 100644 index 0000000..17a01e4 --- /dev/null +++ b/c/bam_stats_output.c @@ -0,0 +1,137 @@ +/* LICENCE +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014-2016 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*/ + +#include +#include +#include "bam_stats_output.h" +#include "bam_stats_calcs.h" + +static char *bas_header = "bam_filename\tsample\tplatform\tplatform_unit\tlibrary\treadgroup\tread_length_r1\tread_length_r2\t#_mapped_bases\t#_mapped_bases_r1\t#_mapped_bases_r2\t#_divergent_bases\t#_divergent_bases_r1\t#_divergent_bases_r2\t#_total_reads\t#_total_reads_r1\t#_total_reads_r2\t#_mapped_reads\t#_mapped_reads_r1\t#_mapped_reads_r2\t#_mapped_reads_properly_paired\t#_gc_bases_r1\t#_gc_bases_r2\tmean_insert_size\tinsert_size_sd\tmedian_insert_size\t#_duplicate_reads\n"; +static char *rg_line_pattern = "%s\t%s\t%s\t%s\t%s\t%s\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%.3f\t%.3f\t%.3f\t%"PRIu64"\n"; + + +int bam_stats_output_print_results(rg_info_t **grps,int grps_size,stats_rd_t*** grp_stats,char *input_file,char *output_file){ + FILE *out = NULL; + check(output_file != NULL, "Output file was NULL"); + if (strcmp(output_file,"-")==0) { + out = stdout; + } else { + out = fopen(output_file,"w"); + } + check(out != NULL,"Error trying to open output file %s for writing.",output_file); + + int chk = fprintf(out,"%s",bas_header); + check(chk==strlen(bas_header),"Error writing bas_header to output file."); + + //Iterate through each RG + int i=0; + for(i=0;icount==0 && grp_stats[i][1]->count==0) continue; // Skip empty read groups stats + uint64_t unmapped_r1 = grp_stats[i][0]->umap; + uint64_t unmapped_r2 = grp_stats[i][1]->umap; + uint64_t unmapped = grp_stats[i][0]->umap + grp_stats[i][1]->umap; + + uint64_t total_reads_r1 = grp_stats[i][0]->count; + uint64_t total_reads_r2 = grp_stats[i][1]->count; + uint64_t total_reads = grp_stats[i][0]->count + grp_stats[i][1]->count; + + uint64_t mapped_reads = total_reads - unmapped; + + uint64_t gc_r1 = grp_stats[i][0]->gc; + uint64_t gc_r2 = grp_stats[i][1]->gc; + + uint64_t mapped_reads_r1 = 0; + uint64_t mapped_reads_r2 = 0; + uint64_t proper_pairs = 0; + uint64_t mapped_bases = 0; + uint64_t mapped_bases_r1 = 0; + uint64_t mapped_bases_r2 = 0; + uint64_t divergent_bases = 0; + uint64_t divergent_bases_r1 = 0; + uint64_t divergent_bases_r2 = 0; + uint64_t dup_reads = 0; + + double mean_insert_size = 0; + double insert_size_sd = 0; + double median_insert_size = 0; + + if(mapped_reads>0){ + //Only need group one as they are pairs + proper_pairs = grp_stats[i][0]->proper; + mapped_reads_r1 = total_reads_r1 - unmapped_r1; + mapped_reads_r2 = total_reads_r2 - unmapped_r2; + + mapped_bases_r1 = grp_stats[i][0]->mapped_bases; + mapped_bases_r2 = grp_stats[i][1]->mapped_bases; + mapped_bases = grp_stats[i][0]->mapped_bases + grp_stats[i][1]->mapped_bases; + + divergent_bases_r1 = grp_stats[i][0]->divergent; + divergent_bases_r2 = grp_stats[i][1]->divergent; + divergent_bases = grp_stats[i][0]->divergent + grp_stats[i][1]->divergent; + + bam_stats_calcs_calculate_mean_sd_median_insert_size(grp_stats[i][0]->inserts,&mean_insert_size,&insert_size_sd,&median_insert_size); + dup_reads = grp_stats[i][0]->dups + grp_stats[i][1]->dups; + } + + uint32_t read_length_r1 = grp_stats[i][0]->length; + uint32_t read_length_r2 = grp_stats[i][1]->length; + + char *file = basename(input_file); + + chk = fprintf(out,rg_line_pattern, + file, + grps[i]->sample, + grps[i]->platform, + grps[i]->platform_unit, + grps[i]->lib, + grps[i]->id, + read_length_r1, + read_length_r2, + mapped_bases, + mapped_bases_r1, + mapped_bases_r2, + divergent_bases, + divergent_bases_r1, + divergent_bases_r2, + total_reads, + total_reads_r1, + total_reads_r2, + mapped_reads, + mapped_reads_r1, + mapped_reads_r2, + proper_pairs, + gc_r1, + gc_r2, + mean_insert_size, + insert_size_sd, + median_insert_size, + dup_reads); + + check(chk>0,"Error writing bas line to output file."); + fflush(out); + } + + if (out != stdout) fclose(out); + return 0; + +error: + if(out && out != stdout) fclose(out); + return -1; + +} \ No newline at end of file diff --git a/c/bam_stats_output.h b/c/bam_stats_output.h new file mode 100644 index 0000000..0e90391 --- /dev/null +++ b/c/bam_stats_output.h @@ -0,0 +1,27 @@ +/* LICENCE +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*/ + +#ifndef __bam_stats_output_h__ +#define __bam_stats_output_h__ + +#include "bam_access.h" + +int bam_stats_output_print_results(rg_info_t **grps,int grps_size,stats_rd_t*** grp_stats,char *input_file,char *output_file); + +#endif \ No newline at end of file diff --git a/c/c_tests/bam_access_tests.c b/c/c_tests/bam_access_tests.c new file mode 100644 index 0000000..d6af59f --- /dev/null +++ b/c/c_tests/bam_access_tests.c @@ -0,0 +1,602 @@ +/*########LICENCE######### +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014,2015,2016 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*#########LICENCE#########*/ + +#include +#include "minunit.h" +#include "bam_access.h" + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +static inline int kputsn(const char *p, int l, kstring_t *s) +{ + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + memcpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; +} + + +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +char *test_bam = "../t/data/Stats.bam"; +char *exp_plat = "GAII"; +char *exp_sample = "PD1234a"; +char *exp_lib = "PD1234a 140546_1054"; +char *exp_rgid_1 = "29976"; +char *exp_platform_unit_1 = "5178_6"; +char *exp_rgid_2 = "29978"; +char *exp_platform_unit_2 = "5085_6"; +uint32_t exp_rd_length = 20; +uint64_t exp_rg1_rd1_tot_count = 7; +uint64_t exp_rg1_rd2_tot_count = 3; +uint64_t exp_rg2_rd1_tot_count = 3; +uint64_t exp_rg2_rd2_tot_count = 3; +uint64_t exp_rg1_rd1_dups = 4; +uint64_t exp_rg1_rd2_dups = 0; +uint64_t exp_rg2_rd1_dups = 0; +uint64_t exp_rg2_rd2_dups = 0; +uint64_t exp_rg1_rd1_gc = 63; +uint64_t exp_rg1_rd2_gc = 27; +uint64_t exp_rg2_rd1_gc = 27; +uint64_t exp_rg2_rd2_gc = 27; +uint64_t exp_rg1_rd1_umap = 1; +uint64_t exp_rg1_rd2_umap = 1; +uint64_t exp_rg2_rd1_umap = 2; +uint64_t exp_rg2_rd2_umap = 2; +uint64_t exp_rg1_rd1_divergent = 30; +uint64_t exp_rg1_rd2_divergent = 18; +uint64_t exp_rg2_rd1_divergent = 11; +uint64_t exp_rg2_rd2_divergent = 12; +uint64_t exp_rg1_rd1_mapped_bases = 115; +uint64_t exp_rg1_rd2_mapped_bases = 37; +uint64_t exp_rg2_rd1_mapped_bases = 20; +uint64_t exp_rg2_rd2_mapped_bases = 20; +uint64_t exp_rg1_rd1_proper = 6; +uint64_t exp_rg1_rd2_proper = 0; +uint64_t exp_rg2_rd1_proper = 1; +uint64_t exp_rg2_rd2_proper = 0; + +char err[100]; + +char *test_bam_access_parse_header(){ + htsFile *input; + bam_hdr_t *head; + int grps_size = 0; + stats_rd_t*** grp_stats; + //Open bam file as object + input = hts_open(test_bam,"r"); + if (input == NULL){ + sprintf(err,"Error opening bam file %s\n",test_bam); + return err; + } + //Read header from bam file + head = sam_hdr_read(input); + if (head == NULL){ + sprintf(err,"Error reading header from bam file %s\n",test_bam); + return err; + } + //Run header parsing + rg_info_t **grps = bam_access_parse_header(head, &grps_size, &grp_stats); + if(grps_size != 2){ + sprintf(err,"Didn't read two read groups from test bam: %d\n",grps_size); + return err; + } + //Check rg 1 + if(strcmp(grps[0]->id,exp_rgid_1)!=0){ + sprintf(err,"Read group %d expected id %s but got %s.\n",1,exp_rgid_1,grps[0]->id); + return err; + } + if(strcmp(grps[0]->platform_unit,exp_platform_unit_1)!=0){ + sprintf(err,"Read group %d expected platform_unit %s but got %s.\n",2,exp_platform_unit_1,grps[0]->platform_unit); + return err; + } + //Check rg 2 + if(strcmp(grps[1]->id,exp_rgid_2)!=0){ + sprintf(err,"Read group %d expected id %s but got %s.\n",2,exp_rgid_2,grps[1]->id); + return err; + } + if(strcmp(grps[1]->platform_unit,exp_platform_unit_2)!=0){ + sprintf(err,"Read group %d expected platform_unit %s but got %s.\n",2,exp_platform_unit_2,grps[1]->platform_unit); + return err; + } + + //Commmon to both RGs + int i=0; + for (i=0;i<2;i++){ + if(strcmp(grps[i]->platform,exp_plat)!=0){ + sprintf(err,"Read group %d expected platform %s but got %s.\n",(i+1),exp_plat,grps[i]->platform); + return err; + } + if(strcmp(grps[i]->sample,exp_sample)!=0){ + sprintf(err,"Read group %d expected sample %s but got %s.\n",(i+1),exp_sample,grps[i]->sample); + return err; + } + if(strcmp(grps[i]->lib,exp_lib)!= 0){ + sprintf(err,"Read group %d expected lib %s but got %s.\n",(i+1),exp_lib,grps[i]->lib); + return err; + } + } + return NULL; +} + +char *test_bam_access_get_mapped_base_count_from_cigar(){ + htsFile *input; + bam_hdr_t *head; + kstring_t str = {0,0,0}; + char *sample_sam = "IL29_5178:2:54:17473:17010 579 1 9993 0 5S10M5S = 9993 100 CTCTTCCGATCTTTAGGGTT ;?;??>>>>Flength!=exp_rd_length){ + sprintf(err,"Read group %d read_2 length expected %"PRIu32" but got %"PRIu32".\n",(i+1),exp_rd_length,grp_stats[i][0]->length); + return err; + } + if(grp_stats[i][1]->length!=exp_rd_length){ + sprintf(err,"Read group %d read_2 length expected %"PRIu32" but got %"PRIu32".\n",(i+1),exp_rd_length,grp_stats[i][1]->length); + return err; + } + } + + /******RG1 checks*****/ + //Read count 1 + if(grp_stats[0][0]->count!=exp_rg1_rd1_tot_count){ + sprintf(err,"RG 1, read_1 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_tot_count,grp_stats[0][0]->count); + return err; + } + //Read count 2 + if(grp_stats[0][1]->count!=exp_rg1_rd2_tot_count){ + sprintf(err,"RG 1, read_2 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_tot_count,grp_stats[0][1]->count); + return err; + } + //Duplicate reads 1 + if(grp_stats[0][0]->dups!=exp_rg1_rd1_dups){ + sprintf(err,"RG 1, read_1 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_dups,grp_stats[0][0]->dups); + return err; + } + //Duplicate reads 2 + if(grp_stats[0][1]->dups!=exp_rg1_rd2_dups){ + sprintf(err,"RG 1, read_2 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_dups,grp_stats[0][1]->dups); + return err; + } + //GC 1 + if(grp_stats[0][0]->gc!=exp_rg1_rd1_gc){ + sprintf(err,"RG 1, read_1 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_gc,grp_stats[0][0]->gc); + return err; + } + //GC 2 + if(grp_stats[0][1]->gc!=exp_rg1_rd2_gc){ + sprintf(err,"RG 1, read_2 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_gc,grp_stats[0][1]->gc); + return err; + } + //Unmapped 1 + if(grp_stats[0][0]->umap!=exp_rg1_rd1_umap){ + sprintf(err,"RG 1, read_1 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_umap,grp_stats[0][0]->umap); + return err; + } + //Unmapped 2 + if(grp_stats[0][1]->umap!=exp_rg1_rd2_umap){ + sprintf(err,"RG 1, read_2 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_umap,grp_stats[0][1]->umap); + return err; + } + //divergent 1 + if(grp_stats[0][0]->divergent!=exp_rg1_rd1_divergent){ + sprintf(err,"RG 1, read_1 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_divergent,grp_stats[0][0]->divergent); + return err; + } + //divergent 2 + if(grp_stats[0][1]->divergent!=exp_rg1_rd2_divergent){ + sprintf(err,"RG 1, read_2 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_divergent,grp_stats[0][1]->divergent); + return err; + } + //mapped_bases 1 + if(grp_stats[0][0]->mapped_bases!=exp_rg1_rd1_mapped_bases){ + sprintf(err,"RG 1, read_1 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_mapped_bases,grp_stats[0][0]->mapped_bases); + return err; + } + //mapped_bases 2 + if(grp_stats[0][1]->mapped_bases!=exp_rg1_rd2_mapped_bases){ + sprintf(err,"RG 1, read_2 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_mapped_bases,grp_stats[0][1]->mapped_bases); + return err; + } + //proper 1 + if(grp_stats[0][0]->proper!=exp_rg1_rd1_proper){ + sprintf(err,"RG 1, read_1 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_proper,grp_stats[0][0]->proper); + return err; + } + //proper 2 + if(grp_stats[0][1]->proper!=exp_rg1_rd2_proper){ + sprintf(err,"RG 1, read_2 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_proper,grp_stats[0][1]->proper); + return err; + } + + + + + /******RG2 checks*****/ + if(grp_stats[1][0]->count!=exp_rg2_rd1_tot_count){ + sprintf(err,"RG 2, read_1 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_tot_count,grp_stats[1][0]->count); + return err; + } + if(grp_stats[1][1]->count!=exp_rg2_rd2_tot_count){ + sprintf(err,"RG 2, read_2 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_tot_count,grp_stats[1][1]->count); + return err; + } + //Duplicate reads 1 + if(grp_stats[1][0]->dups!=exp_rg2_rd1_dups){ + sprintf(err,"RG 1, read_1 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_dups,grp_stats[1][0]->dups); + return err; + } + //Duplicate reads 2 + if(grp_stats[1][1]->dups!=exp_rg2_rd2_dups){ + sprintf(err,"RG 1, read_2 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_dups,grp_stats[1][1]->dups); + return err; + } + //GC 1 + if(grp_stats[1][0]->gc!=exp_rg2_rd1_gc){ + sprintf(err,"RG 1, read_1 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_gc,grp_stats[1][0]->gc); + return err; + } + //GC 2 + if(grp_stats[1][1]->gc!=exp_rg2_rd2_gc){ + sprintf(err,"RG 1, read_2 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_gc,grp_stats[1][1]->gc); + return err; + } + //Unmapped 1 + if(grp_stats[1][0]->umap!=exp_rg2_rd1_umap){ + sprintf(err,"RG 2, read_1 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_umap,grp_stats[1][0]->umap); + return err; + } + //Unmapped 2 + if(grp_stats[1][1]->umap!=exp_rg2_rd2_umap){ + sprintf(err,"RG 2, read_2 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_umap,grp_stats[1][1]->umap); + return err; + } + //divergent 1 + if(grp_stats[1][0]->divergent!=exp_rg2_rd1_divergent){ + sprintf(err,"RG 2, read_1 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_divergent,grp_stats[1][0]->divergent); + return err; + } + //divergent 2 + if(grp_stats[1][1]->divergent!=exp_rg2_rd2_divergent){ + sprintf(err,"RG 2, read_2 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_divergent,grp_stats[1][1]->divergent); + return err; + } + //mapped_bases 1 + if(grp_stats[1][0]->mapped_bases!=exp_rg2_rd1_mapped_bases){ + sprintf(err,"RG 2, read_1 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_mapped_bases,grp_stats[1][0]->mapped_bases); + return err; + } + //mapped_bases 2 + if(grp_stats[1][1]->mapped_bases!=exp_rg2_rd2_mapped_bases){ + sprintf(err,"RG 2, read_2 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_mapped_bases,grp_stats[1][1]->mapped_bases); + return err; + } + //proper 1 + if(grp_stats[1][0]->proper!=exp_rg2_rd1_proper){ + sprintf(err,"RG 2, read_1 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_proper,grp_stats[1][0]->proper); + return err; + } + //proper 2 + if(grp_stats[1][1]->proper!=exp_rg2_rd2_proper){ + sprintf(err,"RG 2, read_2 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_proper,grp_stats[1][1]->proper); + return err; + } + return NULL; +} + +char *test_bam_access_process_reads_rna(){ // rna flag in this method includes secondary reads + htsFile *input; + bam_hdr_t *head; + + int grps_size = 0; + stats_rd_t*** grp_stats; + //Open bam file as object + input = hts_open(test_bam,"r"); + if (input == NULL){ + sprintf(err,"Error opening bam file %s\n",test_bam); + return err; + } + //Read header from bam file + head = sam_hdr_read(input); + if (head == NULL){ + sprintf(err,"Error reading header from bam file %s\n",test_bam); + return err; + } + //Run header parsing + rg_info_t **grps = bam_access_parse_header(head, &grps_size, &grp_stats); + if(grps_size != 2){ + sprintf(err,"Didn't read two read groups from test bam: %d\n",grps_size); + return err; + } + int check = bam_access_process_reads(input, head, grps, grps_size, &grp_stats, 0); + if(check!=0){ + sprintf(err,"Error processing reads in bam file, non RNA.\n"); + } + + //Commmon to both RGs + int i=0; + for (i=0;i<2;i++){ + //Read lengths + if(grp_stats[i][0]->length!=exp_rd_length){ + sprintf(err,"Read group %d read_2 length expected %"PRIu32" but got %"PRIu32".\n",(i+1),exp_rd_length,grp_stats[i][0]->length); + return err; + } + if(grp_stats[i][1]->length!=exp_rd_length){ + sprintf(err,"Read group %d read_2 length expected %"PRIu32" but got %"PRIu32".\n",(i+1),exp_rd_length,grp_stats[i][1]->length); + return err; + } + } + + /******RG1 checks*****/ + //Read count 1 + if(grp_stats[0][0]->count!=exp_rg1_rd1_tot_count){ + sprintf(err,"RG 1, read_1 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_tot_count,grp_stats[0][0]->count); + return err; + } + //Read count 2 + if(grp_stats[0][1]->count!=exp_rg1_rd2_tot_count){ + sprintf(err,"RG 1, read_2 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_tot_count,grp_stats[0][1]->count); + return err; + } + //Duplicate reads 1 + if(grp_stats[0][0]->dups!=exp_rg1_rd1_dups){ + sprintf(err,"RG 1, read_1 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_dups,grp_stats[0][0]->dups); + return err; + } + //Duplicate reads 2 + if(grp_stats[0][1]->dups!=exp_rg1_rd2_dups){ + sprintf(err,"RG 1, read_2 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_dups,grp_stats[0][1]->dups); + return err; + } + //GC 1 + if(grp_stats[0][0]->gc!=exp_rg1_rd1_gc){ + sprintf(err,"RG 1, read_1 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_gc,grp_stats[0][0]->gc); + return err; + } + //GC 2 + if(grp_stats[0][1]->gc!=exp_rg1_rd2_gc){ + sprintf(err,"RG 1, read_2 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_gc,grp_stats[0][1]->gc); + return err; + } + //Unmapped 1 + if(grp_stats[0][0]->umap!=exp_rg1_rd1_umap){ + sprintf(err,"RG 1, read_1 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_umap,grp_stats[0][0]->umap); + return err; + } + //Unmapped 2 + if(grp_stats[0][1]->umap!=exp_rg1_rd2_umap){ + sprintf(err,"RG 1, read_2 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_umap,grp_stats[0][1]->umap); + return err; + } + //divergent 1 + if(grp_stats[0][0]->divergent!=exp_rg1_rd1_divergent){ + sprintf(err,"RG 1, read_1 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_divergent,grp_stats[0][0]->divergent); + return err; + } + //divergent 2 + if(grp_stats[0][1]->divergent!=exp_rg1_rd2_divergent){ + sprintf(err,"RG 1, read_2 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_divergent,grp_stats[0][1]->divergent); + return err; + } + //mapped_bases 1 + if(grp_stats[0][0]->mapped_bases!=exp_rg1_rd1_mapped_bases){ + sprintf(err,"RG 1, read_1 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_mapped_bases,grp_stats[0][0]->mapped_bases); + return err; + } + //mapped_bases 2 + if(grp_stats[0][1]->mapped_bases!=exp_rg1_rd2_mapped_bases){ + sprintf(err,"RG 1, read_2 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_mapped_bases,grp_stats[0][1]->mapped_bases); + return err; + } + //proper 1 + if(grp_stats[0][0]->proper!=exp_rg1_rd1_proper){ + sprintf(err,"RG 1, read_1 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd1_proper,grp_stats[0][0]->proper); + return err; + } + //proper 2 + if(grp_stats[0][1]->proper!=exp_rg1_rd2_proper){ + sprintf(err,"RG 1, read_2 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg1_rd2_proper,grp_stats[0][1]->proper); + return err; + } + + + + + /******RG2 checks*****/ + if(grp_stats[1][0]->count!=exp_rg2_rd1_tot_count){ + sprintf(err,"RG 2, read_1 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_tot_count,grp_stats[1][0]->count); + return err; + } + if(grp_stats[1][1]->count!=exp_rg2_rd2_tot_count){ + sprintf(err,"RG 2, read_2 count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_tot_count,grp_stats[1][1]->count); + return err; + } + //Duplicate reads 1 + if(grp_stats[1][0]->dups!=exp_rg2_rd1_dups){ + sprintf(err,"RG 1, read_1 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_dups,grp_stats[1][0]->dups); + return err; + } + //Duplicate reads 2 + if(grp_stats[1][1]->dups!=exp_rg2_rd2_dups){ + sprintf(err,"RG 1, read_2 duplicate count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_dups,grp_stats[1][1]->dups); + return err; + } + //GC 1 + if(grp_stats[1][0]->gc!=exp_rg2_rd1_gc){ + sprintf(err,"RG 1, read_1 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_gc,grp_stats[1][0]->gc); + return err; + } + //GC 2 + if(grp_stats[1][1]->gc!=exp_rg2_rd2_gc){ + sprintf(err,"RG 1, read_2 gc count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_gc,grp_stats[1][1]->gc); + return err; + } + //Unmapped 1 + if(grp_stats[1][0]->umap!=exp_rg2_rd1_umap){ + sprintf(err,"RG 2, read_1 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_umap,grp_stats[1][0]->umap); + return err; + } + //Unmapped 2 + if(grp_stats[1][1]->umap!=exp_rg2_rd2_umap){ + sprintf(err,"RG 2, read_2 umap count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_umap,grp_stats[1][1]->umap); + return err; + } + //divergent 1 + if(grp_stats[1][0]->divergent!=exp_rg2_rd1_divergent){ + sprintf(err,"RG 2, read_1 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_divergent,grp_stats[1][0]->divergent); + return err; + } + //divergent 2 + if(grp_stats[1][1]->divergent!=exp_rg2_rd2_divergent){ + sprintf(err,"RG 2, read_2 divergent count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_divergent,grp_stats[1][1]->divergent); + return err; + } + //mapped_bases 1 + if(grp_stats[1][0]->mapped_bases!=exp_rg2_rd1_mapped_bases){ + sprintf(err,"RG 2, read_1 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_mapped_bases,grp_stats[1][0]->mapped_bases); + return err; + } + //mapped_bases 2 + if(grp_stats[1][1]->mapped_bases!=exp_rg2_rd2_mapped_bases){ + sprintf(err,"RG 2, read_2 mapped_bases count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_mapped_bases,grp_stats[1][1]->mapped_bases); + return err; + } + //proper 1 + if(grp_stats[1][0]->proper!=exp_rg2_rd1_proper){ + sprintf(err,"RG 2, read_1 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd1_proper,grp_stats[1][0]->proper); + return err; + } + //proper 2 + if(grp_stats[1][1]->proper!=exp_rg2_rd2_proper){ + sprintf(err,"RG 2, read_2 proper count incorrect. Expected %"PRIu64" but got %"PRIu64"\n",exp_rg2_rd2_proper,grp_stats[1][1]->proper); + return err; + } + + //Run it again but with RNA set () + + + return NULL; +} + + +char *all_tests() { + mu_suite_start(); + mu_run_test(test_bam_access_parse_header); + mu_run_test(test_bam_access_get_mapped_base_count_from_cigar); + mu_run_test(test_bam_access_process_reads_no_rna); + mu_run_test(test_bam_access_process_reads_rna); + return NULL; +} + +RUN_TESTS(all_tests); \ No newline at end of file diff --git a/c/c_tests/bam_stats_calcs_tests.c b/c/c_tests/bam_stats_calcs_tests.c new file mode 100644 index 0000000..3d0fa73 --- /dev/null +++ b/c/c_tests/bam_stats_calcs_tests.c @@ -0,0 +1,73 @@ +/*########LICENCE######### +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014-2016 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*#########LICENCE#########*/ + +#include +#include "minunit.h" +#include "bam_stats_calcs.h" +#include "khash.h" + +KHASH_MAP_INIT_INT(insert,uint64_t) +//KHASH_INIT2(ins,, khint32_t, uint64_t, 1, kh_int_hash_func, kh_int_hash_equal) + +double exp_mean = 150; +double exp_sd = 50; +double exp_median = 150; +char err[100]; + +char *bam_stats_calcs_calculate_mean_sd_median_insert_size_test(){ + int res; + khash_t(ins) *inserts; + inserts = kh_init(ins); + khint_t k; + k = kh_put(ins,inserts,abs(200),&res); + kh_value(inserts,k) = 50; + k = kh_put(ins,inserts,abs(100),&res); + kh_value(inserts,k) = 50; + + double mean; + double sd; + double median; + + int check = bam_stats_calcs_calculate_mean_sd_median_insert_size(inserts, &mean, &sd, &median); + + if(mean != exp_mean){ + sprintf(err,"Mean from calculation %f is not as expected %f\n",mean,exp_mean); + return err; + } + if(sd != exp_sd){ + sprintf(err,"SD from calculation %f is not as expected %f\n",sd,exp_sd); + return err; + } + if(median != exp_median){ + sprintf(err,"median from calculation %f is not as expected %f\n",median,exp_median); + return err; + } + + + + return NULL; +} + +char *all_tests() { + mu_suite_start(); + mu_run_test(bam_stats_calcs_calculate_mean_sd_median_insert_size_test); + return NULL; +} + +RUN_TESTS(all_tests); \ No newline at end of file diff --git a/c/c_tests/bam_stats_output_tests.c b/c/c_tests/bam_stats_output_tests.c new file mode 100644 index 0000000..1a2c713 --- /dev/null +++ b/c/c_tests/bam_stats_output_tests.c @@ -0,0 +1,190 @@ +/*########LICENCE######### +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014-2016 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*#########LICENCE#########*/ + +#include +#include "minunit.h" +#include "bam_stats_output.h" + +char err[100]; +char *exp_file = "../t/data/Stats.c.bam.bas"; +char *input_file = "../t/data/Stats.bam"; + +int compare_files(char *file1, char *file2){ + FILE *fp1; + FILE *fp2; + char c1[500], c2[500]; + int cmp; + fp1 = fopen(file1, "r"); + fp2 = fopen(file2, "r"); + + if((fp1 == NULL) || (fp2 == NULL)){ + return -1; + }else{ + while((fgets(c1 , 500, fp1) != NULL) && (fgets(c2 , 500, fp2) != NULL)){ + if((cmp = strcmp(c1, c2)) != 0){ + fprintf(stderr,"Expected: '%s'\nGot: '%s'",c1,c2); + fclose(fp1); + fclose(fp2); + return -1; + } + } + } + fclose(fp1); + fclose(fp2); + return 0; +} + +char *bam_stats_output_print_results_test_file(){ + rg_info_t **grps = NULL; + int grps_size; + stats_rd_t*** grp_stats; + char *output_file = NULL; + int res = bam_stats_output_print_results(grps, grps_size,grp_stats,input_file,output_file); + if(res != -1){ + sprintf(err,"Should have encountered an error for bad output file."); + return err; + } + output_file = "/cantwritehere"; + res = bam_stats_output_print_results(grps, grps_size,grp_stats,input_file,output_file); + if(res != -1){ + sprintf(err,"Should have encountered an error for bad output file."); + return err; + } + + + /****Check for output to file****/ + htsFile *input = NULL; + bam_hdr_t *head = NULL; + //Open bam file as object + input = hts_open(input_file,"r"); + if(input==NULL){ + sprintf(err,"Error opening hts file for reading '%s'.\n",input_file); + return err; + } + + //Read header from bam file + head = sam_hdr_read(input); + if(head == NULL){ + sprintf(err,"Error reading header from opened hts file '%s'.\n",input_file); + return err; + } + + grps = bam_access_parse_header(head, &grps_size, &grp_stats); + if(grps == NULL){ + sprintf(err,"Error fetching read groups from header.\n"); + return err; + } + //Process every read in bam file. + int check = bam_access_process_reads(input,head,grps, grps_size, &grp_stats, 0); + if(check!=0){ + sprintf(err,"Error processing reads in bam file.\n"); + return err; + } + output_file = "../t/data/test_out.bam.bas"; + res = bam_stats_output_print_results(grps, grps_size,grp_stats,input_file,output_file); + + //Check test file is equal to expected + if(compare_files(exp_file,output_file) != 0){ + sprintf(err,"Two files expected %s && got %s were not equal in content\n",exp_file,output_file); + } + + //Delete test file + int un = unlink(output_file); + if(un != 0){ + sprintf(err,"Failed to delete tmp output file %s.\n",output_file); + } + return NULL; +} + +char *bam_stats_output_print_results_test_stdout(){ + rg_info_t **grps = NULL; + int grps_size; + stats_rd_t*** grp_stats; + char *output_file = NULL; + int res = bam_stats_output_print_results(grps, grps_size,grp_stats,input_file,output_file); + if(res != -1){ + sprintf(err,"Should have encountered an error for bad output file."); + return err; + } + output_file = "/cantwritehere"; + res = bam_stats_output_print_results(grps, grps_size,grp_stats,input_file,output_file); + if(res != -1){ + sprintf(err,"Should have encountered an error for bad output file."); + return err; + } + + + /****Check for output to file****/ + htsFile *input = NULL; + bam_hdr_t *head = NULL; + //Open bam file as object + input = hts_open(input_file,"r"); + if(input==NULL){ + sprintf(err,"Error opening hts file for reading '%s'.\n",input_file); + return err; + } + + //Read header from bam file + head = sam_hdr_read(input); + if(head == NULL){ + sprintf(err,"Error reading header from opened hts file '%s'.\n",input_file); + return err; + } + + grps = bam_access_parse_header(head, &grps_size, &grp_stats); + if(grps == NULL){ + sprintf(err,"Error fetching read groups from header.\n"); + return err; + } + //Process every read in bam file. + int check = bam_access_process_reads(input,head,grps, grps_size, &grp_stats, 0); + if(check!=0){ + sprintf(err,"Error processing reads in bam file.\n"); + return err; + } + output_file = "../t/data/test_out.bam.bas"; + + /****Check for output to stdout****/ + //redirect stdout to file + FILE *frp = freopen(output_file, "w", stdout); + if(frp == NULL){ + sprintf(err,"Error reassigning stdout to file %s\n",output_file); + } + res = bam_stats_output_print_results(grps, grps_size,grp_stats,input_file,output_file); + frp = freopen("/dev/stdout", "w", stdout); + //Check test file is equal to expected + if(compare_files(exp_file,output_file) != 0){ + sprintf(err,"Two files expected %s && got %s were not equal in content\n",exp_file,output_file); + } + //Delete test file + int un = unlink(output_file); + if(un != 0){ + sprintf(err,"Failed to delete tmp output file %s.\n",output_file); + } + return NULL; +} + +char *all_tests() { + mu_suite_start(); + mu_run_test(bam_stats_output_print_results_test_file); + mu_run_test(bam_stats_output_print_results_test_stdout); + return NULL; +} + +RUN_TESTS(all_tests); \ No newline at end of file diff --git a/c/c_tests/minunit.h b/c/c_tests/minunit.h new file mode 100644 index 0000000..fca3c98 --- /dev/null +++ b/c/c_tests/minunit.h @@ -0,0 +1,52 @@ +/*########LICENCE######### +* PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +* Copyright (C) 2014,2015,2016 ICGC PanCancer Project +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 +* of the License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not see: +* http://www.gnu.org/licenses/gpl-2.0.html +*#########LICENCE#########*/ + +#undef NDEBUG +#ifndef _minunit_h +#define _minunit_h + +#include +#include +#include + +#define mu_suite_start() char *message = NULL + +#define mu_assert(test, message) if (!(test)) { log_err(message); return message; } +#define mu_run_test(test) debug("\n-----%s", " " #test); \ + message = test(); tests_run++; if (message) return message; + +#define RUN_TESTS(name) int main(int argc, char *argv[]) {\ + argc = 1; \ + debug("----- RUNNING: %s", argv[0]);\ + printf("----\nRUNNING: %s\n", argv[0]);\ + char *result = name();\ + if (result != 0) {\ + printf("FAILED: %s\n", result);\ + }\ + else {\ + printf("ALL TESTS PASSED\n");\ + }\ + printf("Tests run: %d\n", tests_run);\ + exit(result != 0);\ +} + + +int tests_run; + +#endif diff --git a/c/c_tests/runtests.sh b/c/c_tests/runtests.sh new file mode 100644 index 0000000..262b526 --- /dev/null +++ b/c/c_tests/runtests.sh @@ -0,0 +1,38 @@ +##########LICENCE########## +# PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project +# Copyright (C) 2014,2015 ICGC PanCancer Project +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not see: +# http://www.gnu.org/licenses/gpl-2.0.html +##########LICENCE########## + +echo "Running unit tests:" + +for i in c_tests/*_tests +do + if test -f $i + then + if $VALGRIND ./$i 2>> c_tests/tests_log + then + echo $i PASS + else + echo "ERROR in test $i: here's tests/tests_log" + echo "------" + tail c_tests/tests_log + exit 1 + fi + fi +done + +echo "" diff --git a/docs.tar.gz b/docs.tar.gz index 2d3f7d5..65a8f77 100644 Binary files a/docs.tar.gz and b/docs.tar.gz differ diff --git a/lib/PCAP.pm b/lib/PCAP.pm index dd1718c..b5a1ee6 100644 --- a/lib/PCAP.pm +++ b/lib/PCAP.pm @@ -2,7 +2,7 @@ package PCAP; ##########LICENCE########## # PCAP - NGS reference implementations and helper code for the ICGC/TCGA Pan-Cancer Analysis Project -# Copyright (C) 2014,2015 ICGC PanCancer Project +# Copyright (C) 2014-2016 ICGC PanCancer Project # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -24,12 +24,12 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '1.12.3'; +our $VERSION = '1.13.0'; our @EXPORT = qw($VERSION); const my $LICENSE => "################# -# PCAP version %s, Copyright (C) 2014-2015 ICGC/TCGA Pan-Cancer Analysis Project +# PCAP version %s, Copyright (C) 2014-2016 ICGC/TCGA Pan-Cancer Analysis Project # PCAP comes with ABSOLUTELY NO WARRANTY # See LICENSE for full details. #################"; @@ -77,6 +77,8 @@ const my %UPGRADE_PATH => ( '0.1.0' => 'biobambam,bwa,samtools', '1.12.0' => '', '1.12.1' => '', '1.12.2' => '', + '1.12.3' => '', + '1.13.0' => '', ); sub license { diff --git a/t/data/Stats.c.bam.bas b/t/data/Stats.c.bam.bas new file mode 100644 index 0000000..5120eac --- /dev/null +++ b/t/data/Stats.c.bam.bas @@ -0,0 +1,3 @@ +bam_filename sample platform platform_unit library readgroup read_length_r1 read_length_r2 #_mapped_bases #_mapped_bases_r1 #_mapped_bases_r2 #_divergent_bases #_divergent_bases_r1 #_divergent_bases_r2 #_total_reads #_total_reads_r1 #_total_reads_r2 #_mapped_reads #_mapped_reads_r1 #_mapped_reads_r2 #_mapped_reads_properly_paired #_gc_bases_r1 #_gc_bases_r2 mean_insert_size insert_size_sd median_insert_size #_duplicate_reads +Stats.bam PD1234a GAII 5178_6 PD1234a 140546_1054 29976 20 20 152 115 37 48 30 18 10 7 3 8 6 2 6 63 27 195.833 119.387 125.000 4 +Stats.bam PD1234a GAII 5085_6 PD1234a 140546_1054 29978 20 20 40 20 20 23 11 12 6 3 3 2 1 1 1 27 27 100.000 0.000 100.000 0