From b97bd571bd64dd92aa1ad0a65010acf8ec389d97 Mon Sep 17 00:00:00 2001 From: Yan Gao Date: Wed, 4 Aug 2021 21:54:39 +0800 Subject: [PATCH] v1.5.0 --- .gitignore | 1 + README.md | 64 +++++++++++++++++++++++++++++++++++++++--------- src/abpoa_cons.c | 35 +++++++++++++------------- src/main.c | 5 ++-- 4 files changed, 73 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 8034603..ea86fd4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +README.html # simulation file simu/pbsim/* simu/* diff --git a/README.md b/README.md index 37d7b30..7a027ce 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,15 @@ [![GitHub Downloads](https://img.shields.io/github/downloads/yangao07/TideHunter/total.svg?style=social&logo=github&label=Download)](https://github.com/yangao07/TideHunter/releases) --> -## Updates (v1.4.4) -* Skip N bases in the input sequences +## Updates (v1.5.0) +* Output fastq format ## Getting started Download the [latest release](https://github.com/yangao07/TideHunter/releases): ``` -wget https://github.com/yangao07/TideHunter/releases/download/v1.4.4/TideHunter-v1.4.4.tar.gz -tar -zxvf TideHunter-v1.4.4.tar.gz && cd TideHunter-v1.4.4 +wget https://github.com/yangao07/TideHunter/releases/download/v1.5.0/TideHunter-v1.5.0.tar.gz +tar -zxvf TideHunter-v1.5.0.tar.gz && cd TideHunter-v1.5.0 ``` Make from source and run with test data: ``` @@ -38,8 +38,9 @@ TideHunter ./test_data/test_50x4.fa > cons.fa - [Pre-built binary executable file for Linux/Unix](#binary) - [Getting started with toy example in `test_data`](#start) - [Usage](#usage) - - [To generate consensus sequencesin FASTA format](#fasta_cons) - - [To generate consensus sequencesin tabular format](#tab_cons) + - [To generate consensus sequences in FASTA format](#fasta_cons) + - [To generate consensus sequences in tabular format](#tab_cons) + - [To generate consensus sequences in FASTQ format](#fq_cons) - [To generate full-length consensus sequences](#full_cons) - [To generate unit sequences in FASTA format](#fasta_unit) - [To generate unit sequences in tabular format](#tab_unit) @@ -49,6 +50,7 @@ TideHunter ./test_data/test_50x4.fa > cons.fa - [Output](#output) - [Tabular format](#tabular) - [FASTA format](#fasta) + - [FASTQ format](#fastq) - [Unit sequences](#unit) - [Contact](#contact) @@ -76,9 +78,9 @@ Make sure you have gcc (>=6.4.0) and zlib installed before compiling. It is recommended to download the latest release of TideHunter from the [release page](https://github.com/yangao07/TideHunter/releases). ``` -wget https://github.com/yangao07/TideHunter/releases/download/v1.4.4/TideHunter-v1.4.4.tar.gz -tar -zxvf TideHunter-v1.4.4.tar.gz -cd TideHunter-v1.4.4; make +wget https://github.com/yangao07/TideHunter/releases/download/v1.5.0/TideHunter-v1.5.0.tar.gz +tar -zxvf TideHunter-v1.5.0.tar.gz +cd TideHunter-v1.5.0; make ``` Or, you can use `git clone` command to download the source code. Don't forget to include the `--recursive` to download the codes of [abPOA](https://github.com/yangao07/abPOA). @@ -91,8 +93,8 @@ cd TideHunter; make ### Pre-built binary executable file for Linux/Unix If you meet any compiling issue, please try the pre-built binary file: ``` -wget https://github.com/yangao07/TideHunter/releases/download/v1.4.4/TideHunter-v1.4.4_x64-linux.tar.gz -tar -zxvf TideHunter-v1.4.4_x64-linux.tar.gz +wget https://github.com/yangao07/TideHunter/releases/download/v1.5.0/TideHunter-v1.5.0_x64-linux.tar.gz +tar -zxvf TideHunter-v1.5.0_x64-linux.tar.gz ``` ## Getting started with toy example in `test_data` @@ -109,6 +111,10 @@ TideHunter ./test_data/test_1000x10.fa > cons.fa ``` TideHunter -f 2 ./test_data/test_1000x10.fa > cons.out ``` +#### To generate consensus sequences in FASTQ format +``` +TideHunter -f 3 ./test_data/test_1000x10.fa > cons.fq +``` #### To generate full-length consensus sequences ``` TideHunter -5 ./test_data/5prime.fa -3 ./test_data/3prime.fa ./test_data/full_length.fa > cons_full.fa @@ -150,13 +156,18 @@ Options: -a --ada-mat-rat FLT minimum match ratio of adapter sequence [0.80] Output: -o --output STR output file [stdout] - -m --min-len [INT] only output consensus sequence with min. length of [30] + -m --min-len INT only output consensus sequence with min. length of [30] + -r --min-cov FLOAT|INT only output consensus sequence with at least R supporting units for all bases: [0.00] + if r is fraction: R = r * total copy number + if r is integer: R = r -u --unit-seq only output unit sequences of each tandem repeat, no consensus sequence [False] -l --longest only output consensus sequence of tandem repeat that covers the longest read sequence [False] -F --full-len only output full-length consensus sequence [False] -f --out-fmt INT output format [1] - 1: FASTA - 2: Tabular + - 3: FASTQ + qualiy score of each base represents the ratio of the consensus coverage to the # total copies. Computing resource: -t --thread INT number of threads to use [4] @@ -222,6 +233,35 @@ The read name and comment of each consensus sequence have the following format: >readName_repN_copyNum readLen_start_end_consLen_aveMatch_fullLen_subPos ``` +### FASTQ format +For FASTQ output format, the read name and comment are the same as described in [FASTA format](#fasta). +TideHunter calculated a customized Phred score as the base quality score of each consensus base: + +

+ + +

+ +Here, is the Sigmoid-smoothed consensus calling error rate for each base: + +

+ + +

+ + is the Sigmoid function: +

+ +

+ + is the coverage of the consensus base and + is the number of total copies. +For example, if one base of the consensus sequence has 4 supporting copies and the total copy number is 5, + is 4 and is 5. + +The Phred quality score was then shifted by 33 and converted to characters based on the ASCII value. +The quality scores range from 0 to 60 and the corresponding ASCII values range from 33 to 93. + ### Unit sequences TideHunter can output the unit sequences without performing the consensus calling step when option `-u/--unit-seq` is enabled. Then, only the following information will be output for the tabular format: diff --git a/src/abpoa_cons.c b/src/abpoa_cons.c index 2a2139d..71c6bfc 100644 --- a/src/abpoa_cons.c +++ b/src/abpoa_cons.c @@ -7,9 +7,11 @@ #include "seq.h" #include "abpoa.h" +#define NAT_E 2.718281828459045 + abpoa_para_t *mt_abpoa_init_para(mini_tandem_para *mtp) { abpoa_para_t *abpt = abpoa_init_para(); - // abpt->cons_agrm = 1; // 0: HB, 1: RC + abpt->cons_agrm = 1; // 0: HB, 1: RC abpt->match = mtp->match; // match score abpt->mismatch = mtp->mismatch; // mismatch penalty abpt->gap_open1 = mtp->gap_open1; // first gap open penalty @@ -26,7 +28,7 @@ abpoa_para_t *mt_abpoa_init_para(mini_tandem_para *mtp) { } int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len, int *pos, int pos_n, uint8_t *cons_bseq, uint8_t *cons_qual, int min_cov) { - int i, seq_n, cons_len = 0; + int i, n_seqs, cons_len = 0; /* clean graph if it is re-used */ abpoa_reset_graph(ab, abpt, seq_len); @@ -35,31 +37,28 @@ int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len, uint8_t **_bseqs = (uint8_t**)malloc(sizeof(uint8_t*) * (pos_n-1)); /* main graph alignment */ // |pos|-----|pos|-----pos| - for (i = seq_n = 0; i < pos_n-1; ++i) { + for (i = n_seqs = 0; i < pos_n-1; ++i) { int start = pos[i], end = pos[i+1]; if (start < 0 || end < 0 || start >= seq_len || end+1 >= seq_len) continue; // fprintf(stdout, ">%d\n", start); - seq_lens[seq_n] = end - start; - _bseqs[seq_n] = bseqs + start + 1; - /*int j; - for (j = start; j < end; ++j) - fprintf(stdout, "%c", "ACGT"[bseqs[j+1]]); - fprintf(stdout, "\n");*/ - ++seq_n; + seq_lens[n_seqs] = end - start; + _bseqs[n_seqs] = bseqs + start + 1; + /*int j; for (j = start; j < end; ++j) fprintf(stdout, "%c", "ACGT"[bseqs[j+1]]); fprintf(stdout, "\n");*/ + ++n_seqs; } #ifdef __DEBUG__ FILE *outfp = stderr; #else FILE *outfp = NULL; #endif - if (seq_n <= 2) { - if (seq_n == 0) err_fatal_simple("No enough sequences to perform msa.\n"); + if (n_seqs <= 2) { + if (n_seqs == 0) err_fatal_simple("No enough sequences to perform msa.\n"); cons_len = seq_lens[0]; for (i = 0; i < cons_len; ++i) cons_bseq[i] = _bseqs[0][i]; } else { uint8_t **_cons_bseq; int **_cons_cov, *_cons_l, _cons_n = 0; - if (min_cov > 0 || cons_qual != NULL) abpoa_msa(ab, abpt, seq_n, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, &_cons_cov, &_cons_l, &_cons_n, NULL, NULL); - else abpoa_msa(ab, abpt, seq_n, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, NULL, &_cons_l, &_cons_n, NULL, NULL); + if (min_cov > 0 || cons_qual != NULL) abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, &_cons_cov, &_cons_l, &_cons_n, NULL, NULL); + else abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, NULL, &_cons_l, &_cons_n, NULL, NULL); if (_cons_n == 1) { cons_len = _cons_l[0]; int skip = 0; @@ -72,10 +71,12 @@ int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len, } } if (cons_qual != NULL) { - int phred; + int phred; double x, p; for (i = 0; i < cons_len; ++i) { - if (_cons_cov[0][i] == seq_n) phred = 73; - else phred = 33 + (int)(-10 * log10((seq_n-_cons_cov[0][i]+0.0) / seq_n)); + // min: 0+33=33, max: 60+33=93 + x = 13.8 * (1.25 * _cons_cov[0][i] / n_seqs - 0.25); + p = 1 - 1.0 / (1.0 + pow(NAT_E, -1 * x)); + phred = 33 + (int)(-10 * log10(p) + 0.499); cons_qual[i] = phred; } } diff --git a/src/main.c b/src/main.c index 0c7f79b..840e425 100644 --- a/src/main.c +++ b/src/main.c @@ -10,7 +10,7 @@ #include "kseq.h" const char PROG[20] = "TideHunter"; -const char VERSION[20] = "1.4.4"; +const char VERSION[20] = "1.5.0"; const char CONTACT[30] = "gaoy286@mail.sysu.edu.cn"; const struct option mini_tandem_opt [] = { @@ -114,7 +114,7 @@ static int usage(void) err_printf(" -o --output STR output file [stdout]\n"); err_printf(" -m --min-len INT only output consensus sequence with min. length of [%d]\n", DEF_MIN_LEN); err_printf(" -r --min-cov FLOAT|INT only output consensus sequence with at least \e[4mR\e[0m supporting units for all bases: [%.2f]\n", DEF_MIN_COV); - err_printf(" if \e[4mr\e[0m is fraction: \e[4mR\e[0m = \e[4mr\e[0m * copy number\n"); + err_printf(" if \e[4mr\e[0m is fraction: \e[4mR\e[0m = \e[4mr\e[0m * total copy number\n"); err_printf(" if \e[4mr\e[0m is integer: \e[4mR\e[0m = \e[4mr\e[0m\n"); err_printf(" -u --unit-seq only output unit sequences of each tandem repeat, no consensus sequence [False]\n"); err_printf(" -l --longest only output consensus sequence of tandem repeat that covers the longest read sequence [False]\n"); @@ -431,7 +431,6 @@ int mini_tandem(const char *read_fn, mini_tandem_para *mtp) return 0; } -// TODO add score para int main(int argc, char *argv[]) { mini_tandem_para *mtp = mini_tandem_init_para();