v1.5.0

yangao07 · Aug 4, 2021 · b97bd57 · b97bd57
1 parent b388c42
commit b97bd57
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+README.html
 # simulation file
 simu/pbsim/*
 simu/*

diff --git a/README.md b/README.md
@@ -10,15 +10,15 @@
 [![GitHub Downloads](https://img.shields.io/github/downloads/yangao07/TideHunter/total.svg?style=social&logo=github&label=Download)](https://github.com/yangao07/TideHunter/releases)
 -->
 
-## Updates (v1.4.4)
-* Skip N bases in the input sequences
+## Updates (v1.5.0)
+* Output fastq format
 
 
 ## Getting started
 Download the [latest release](https://github.com/yangao07/TideHunter/releases):
 ```
-wget https://github.com/yangao07/TideHunter/releases/download/v1.4.4/TideHunter-v1.4.4.tar.gz
-tar -zxvf TideHunter-v1.4.4.tar.gz && cd TideHunter-v1.4.4
+wget https://github.com/yangao07/TideHunter/releases/download/v1.5.0/TideHunter-v1.5.0.tar.gz
+tar -zxvf TideHunter-v1.5.0.tar.gz && cd TideHunter-v1.5.0
 ```
 Make from source and run with test data:
 ```
@@ -38,8 +38,9 @@ TideHunter ./test_data/test_50x4.fa > cons.fa
   - [Pre-built binary executable file for Linux/Unix](#binary)
 - [Getting started with toy example in `test_data`](#start)
 - [Usage](#usage)
-  - [To generate consensus sequencesin FASTA format](#fasta_cons)
-  - [To generate consensus sequencesin tabular format](#tab_cons)
+  - [To generate consensus sequences in FASTA format](#fasta_cons)
+  - [To generate consensus sequences in tabular format](#tab_cons)
+  - [To generate consensus sequences in FASTQ format](#fq_cons)
   - [To generate full-length consensus sequences](#full_cons)
   - [To generate unit sequences in FASTA format](#fasta_unit)
   - [To generate unit sequences in tabular format](#tab_unit)
@@ -49,6 +50,7 @@ TideHunter ./test_data/test_50x4.fa > cons.fa
 - [Output](#output)
   - [Tabular format](#tabular)
   - [FASTA format](#fasta)
+  - [FASTQ format](#fastq)
   - [Unit sequences](#unit)
 - [Contact](#contact)
 
@@ -76,9 +78,9 @@ Make sure you have gcc (>=6.4.0) and zlib installed before compiling.
 It is recommended to download the latest release of TideHunter 
 from the [release page](https://github.com/yangao07/TideHunter/releases).
 ```
-wget https://github.com/yangao07/TideHunter/releases/download/v1.4.4/TideHunter-v1.4.4.tar.gz
-tar -zxvf TideHunter-v1.4.4.tar.gz
-cd TideHunter-v1.4.4; make
+wget https://github.com/yangao07/TideHunter/releases/download/v1.5.0/TideHunter-v1.5.0.tar.gz
+tar -zxvf TideHunter-v1.5.0.tar.gz
+cd TideHunter-v1.5.0; make
 ```
 Or, you can use `git clone` command to download the source code. 
 Don't forget to include the `--recursive` to download the codes of [abPOA](https://github.com/yangao07/abPOA).
@@ -91,8 +93,8 @@ cd TideHunter; make
 ### <a name="binary"></a>Pre-built binary executable file for Linux/Unix 
 If you meet any compiling issue, please try the pre-built binary file:
 ```
-wget https://github.com/yangao07/TideHunter/releases/download/v1.4.4/TideHunter-v1.4.4_x64-linux.tar.gz
-tar -zxvf TideHunter-v1.4.4_x64-linux.tar.gz
+wget https://github.com/yangao07/TideHunter/releases/download/v1.5.0/TideHunter-v1.5.0_x64-linux.tar.gz
+tar -zxvf TideHunter-v1.5.0_x64-linux.tar.gz
 ```
 
 ## <a name="start"></a>Getting started with toy example in `test_data`
@@ -109,6 +111,10 @@ TideHunter ./test_data/test_1000x10.fa > cons.fa
 ```
 TideHunter -f 2 ./test_data/test_1000x10.fa > cons.out
 ```
+#### <a name="fq_cons"></a>To generate consensus sequences in FASTQ format
+```
+TideHunter -f 3 ./test_data/test_1000x10.fa > cons.fq
+```
 #### <a name="full_cons"></a>To generate full-length consensus sequences
 ```
 TideHunter -5 ./test_data/5prime.fa -3 ./test_data/3prime.fa ./test_data/full_length.fa > cons_full.fa
@@ -150,13 +156,18 @@ Options:
     -a --ada-mat-rat FLT    minimum match ratio of adapter sequence [0.80]
   Output:
     -o --output      STR    output file [stdout]
-    -m --min-len     [INT]  only output consensus sequence with min. length of [30]
+    -m --min-len     INT    only output consensus sequence with min. length of [30]
+    -r --min-cov  FLOAT|INT only output consensus sequence with at least R supporting units for all bases: [0.00]
+                            if r is fraction: R = r * total copy number
+                            if r is integer: R = r
     -u --unit-seq           only output unit sequences of each tandem repeat, no consensus sequence [False]
     -l --longest            only output consensus sequence of tandem repeat that covers the longest read sequence [False]
     -F --full-len           only output full-length consensus sequence [False]
     -f --out-fmt     INT    output format [1]
                             - 1: FASTA
                             - 2: Tabular
+                            - 3: FASTQ
+                              qualiy score of each base represents the ratio of the consensus coverage to the # total copies.
   Computing resource:
     -t --thread      INT    number of threads to use [4]
 
@@ -222,6 +233,35 @@ The read name and comment of each consensus sequence have the following format:
 >readName_repN_copyNum readLen_start_end_consLen_aveMatch_fullLen_subPos
 ```
 
+### <a name="fastq"></a>FASTQ format
+For FASTQ output format, the read name and comment are the same as described in [FASTA format](#fasta).
+TideHunter calculated a customized Phred score as the base quality score of each consensus base:
+
+<p align="center">
+  <img src="https://latex.codecogs.com/svg.image?Q_{phred}=-10 \cdot log_{10}(p)"/>
+<!-- <img src="https://render.githubusercontent.com/render/math?math=Q_{phred}=-10 \cdot log_{10}(p)"> -->
+</p>
+
+Here, <img src="https://latex.codecogs.com/svg.image?p"/> is the Sigmoid-smoothed consensus calling error rate for each base:
+
+<p align="center">
+  <!-- <img src="https://render.githubusercontent.com/render/math?math=p=1-S(N_{cons} / N_{total} \cdot 21)">, -->
+  <img src="https://latex.codecogs.com/svg.image?p=1-S(13.8 \cdot (1.25 \cdot N_{cons} / N_{total} - 0.25))"/>
+</p>
+
+<img src="https://latex.codecogs.com/svg.image?S"/> is the Sigmoid function:
+<p align="center">
+  <img src="https://latex.codecogs.com/svg.image?S(x)=\frac{1}{1+e^{-x}}"/>
+</p>
+
+<img src="https://latex.codecogs.com/svg.image?N_{cons}"/> is the coverage of the consensus base and
+<img src="https://latex.codecogs.com/svg.image?N_{total}"/> is the number of total copies. 
+For example, if one base of the consensus sequence has 4 supporting copies and the total copy number is 5,
+<img src="https://latex.codecogs.com/svg.image?N_{cons}"/> is 4 and <img src="https://latex.codecogs.com/svg.image?N_{total}"/> is 5.
+
+The Phred quality score was then shifted by 33 and converted to characters based on the ASCII value.
+The quality scores range from 0 to 60 and the corresponding ASCII values range from 33 to 93.
+
 ### <a name="unit"></a>Unit sequences
 TideHunter can output the unit sequences without performing the consensus calling step when option `-u/--unit-seq` is enabled. Then, only the following information will be output for the tabular format:
 

diff --git a/src/abpoa_cons.c b/src/abpoa_cons.c
@@ -7,9 +7,11 @@
 #include "seq.h"
 #include "abpoa.h"
 
+#define NAT_E 2.718281828459045
+
 abpoa_para_t *mt_abpoa_init_para(mini_tandem_para *mtp) {
     abpoa_para_t *abpt = abpoa_init_para();
-    // abpt->cons_agrm = 1; // 0: HB, 1: RC
+    abpt->cons_agrm = 1; // 0: HB, 1: RC
     abpt->match = mtp->match;     // match score
     abpt->mismatch = mtp->mismatch;  // mismatch penalty
     abpt->gap_open1 = mtp->gap_open1; // first gap open penalty
@@ -26,7 +28,7 @@ abpoa_para_t *mt_abpoa_init_para(mini_tandem_para *mtp) {
 }
 
 int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len, int *pos, int pos_n, uint8_t *cons_bseq, uint8_t *cons_qual, int min_cov) {
-    int i, seq_n, cons_len = 0;
+    int i, n_seqs, cons_len = 0;
 
     /* clean graph if it is re-used */
     abpoa_reset_graph(ab, abpt, seq_len);
@@ -35,31 +37,28 @@ int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len,
     uint8_t **_bseqs = (uint8_t**)malloc(sizeof(uint8_t*) * (pos_n-1));
     /* main graph alignment */
     // |pos|-----|pos|-----pos|
-    for (i = seq_n = 0; i < pos_n-1; ++i) {
+    for (i = n_seqs = 0; i < pos_n-1; ++i) {
         int start = pos[i], end = pos[i+1];
         if (start < 0 || end < 0 || start >= seq_len || end+1 >= seq_len) continue;
         // fprintf(stdout, ">%d\n", start);
-        seq_lens[seq_n] = end - start;
-        _bseqs[seq_n] = bseqs + start + 1;
-        /*int j;
-        for (j = start; j < end; ++j)
-            fprintf(stdout, "%c", "ACGT"[bseqs[j+1]]);
-        fprintf(stdout, "\n");*/
-        ++seq_n;
+        seq_lens[n_seqs] = end - start;
+        _bseqs[n_seqs] = bseqs + start + 1;
+        /*int j; for (j = start; j < end; ++j) fprintf(stdout, "%c", "ACGT"[bseqs[j+1]]); fprintf(stdout, "\n");*/
+        ++n_seqs;
     }
 #ifdef __DEBUG__
     FILE *outfp = stderr;
 #else
     FILE *outfp = NULL;
 #endif
-    if (seq_n <= 2) {
-        if (seq_n == 0) err_fatal_simple("No enough sequences to perform msa.\n");
+    if (n_seqs <= 2) {
+        if (n_seqs == 0) err_fatal_simple("No enough sequences to perform msa.\n");
         cons_len = seq_lens[0];
         for (i = 0; i < cons_len; ++i) cons_bseq[i] = _bseqs[0][i];
     } else {
         uint8_t **_cons_bseq; int **_cons_cov, *_cons_l, _cons_n = 0;
-        if (min_cov > 0 || cons_qual != NULL) abpoa_msa(ab, abpt, seq_n, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, &_cons_cov, &_cons_l, &_cons_n, NULL, NULL);
-        else abpoa_msa(ab, abpt, seq_n, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, NULL, &_cons_l, &_cons_n, NULL, NULL);
+        if (min_cov > 0 || cons_qual != NULL) abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, &_cons_cov, &_cons_l, &_cons_n, NULL, NULL);
+        else abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, _bseqs, outfp, &_cons_bseq, NULL, &_cons_l, &_cons_n, NULL, NULL);
         if (_cons_n == 1) {
             cons_len = _cons_l[0];
             int skip = 0;
@@ -72,10 +71,12 @@ int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len,
                     }
                 }
                 if (cons_qual != NULL) {
-                    int phred;
+                    int phred; double x, p;
                     for (i = 0; i < cons_len; ++i) {
-                        if (_cons_cov[0][i] == seq_n) phred = 73;
-                        else phred = 33 + (int)(-10 * log10((seq_n-_cons_cov[0][i]+0.0) / seq_n));
+                        // min: 0+33=33, max: 60+33=93
+                        x = 13.8 * (1.25 * _cons_cov[0][i] / n_seqs - 0.25);
+                        p = 1 - 1.0 / (1.0 + pow(NAT_E, -1 * x));
+                        phred = 33 + (int)(-10 * log10(p) + 0.499);
                         cons_qual[i] = phred;
                     }
                 }

diff --git a/src/main.c b/src/main.c
@@ -10,7 +10,7 @@
 #include "kseq.h"
 
 const char PROG[20] = "TideHunter";
-const char VERSION[20] = "1.4.4";
+const char VERSION[20] = "1.5.0";
 const char CONTACT[30] = "gaoy286@mail.sysu.edu.cn";
 
 const struct option mini_tandem_opt [] = {
@@ -114,7 +114,7 @@ static int usage(void)
 	err_printf("    -o --output      STR    output file [stdout]\n");
 	err_printf("    -m --min-len     INT    only output consensus sequence with min. length of [%d]\n", DEF_MIN_LEN);
     err_printf("    -r --min-cov  FLOAT|INT only output consensus sequence with at least \e[4mR\e[0m supporting units for all bases: [%.2f]\n", DEF_MIN_COV);
-    err_printf("                            if \e[4mr\e[0m is fraction: \e[4mR\e[0m = \e[4mr\e[0m * copy number\n");
+    err_printf("                            if \e[4mr\e[0m is fraction: \e[4mR\e[0m = \e[4mr\e[0m * total copy number\n");
     err_printf("                            if \e[4mr\e[0m is integer: \e[4mR\e[0m = \e[4mr\e[0m\n");
     err_printf("    -u --unit-seq           only output unit sequences of each tandem repeat, no consensus sequence [False]\n");
 	err_printf("    -l --longest            only output consensus sequence of tandem repeat that covers the longest read sequence [False]\n");
@@ -431,7 +431,6 @@ int mini_tandem(const char *read_fn, mini_tandem_para *mtp)
 	return 0;
 }
 
-// TODO add score para 
 int main(int argc, char *argv[])
 {
 	mini_tandem_para *mtp = mini_tandem_init_para();