diff --git a/main.c b/main.c index 34b49e0c..f7b0a65a 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.7-r670-dirty" +#define MM_VERSION "2.7-r671-dirty" #ifdef __linux__ #include @@ -28,7 +28,7 @@ static struct option long_options[] = { { "seed", required_argument, 0, 0 }, { "no-kalloc", no_argument, 0, 0 }, { "print-qname", no_argument, 0, 0 }, - { "no-self", no_argument, 0, 0 }, + { "no-self", no_argument, 0, 'D' }, { "print-seeds", no_argument, 0, 0 }, { "max-chain-skip", required_argument, 0, 0 }, { "min-dp-len", required_argument, 0, 0 }, @@ -37,17 +37,19 @@ static struct option long_options[] = { { "cost-non-gt-ag", required_argument, 0, 'C' }, { "no-long-join", no_argument, 0, 0 }, { "sr", no_argument, 0, 0 }, - { "frag", optional_argument, 0, 0 }, - { "secondary", optional_argument, 0, 0 }, + { "frag", required_argument, 0, 0 }, + { "secondary", required_argument, 0, 0 }, { "cs", optional_argument, 0, 0 }, { "end-bonus", required_argument, 0, 0 }, { "no-pairing", no_argument, 0, 0 }, - { "splice-flank", optional_argument, 0, 0 }, + { "splice-flank", required_argument, 0, 0 }, { "idx-no-seq", no_argument, 0, 0 }, { "end-seed-pen", required_argument, 0, 0 }, // 21 { "for-only", no_argument, 0, 0 }, // 22 { "rev-only", no_argument, 0, 0 }, // 23 - { "heap-sort", optional_argument, 0, 0 }, // 24 + { "heap-sort", required_argument, 0, 0 }, // 24 + { "all-chain", no_argument, 0, 'P' }, + { "dual", required_argument, 0, 0 }, // 26 { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -70,9 +72,22 @@ static inline int64_t mm_parse_num(const char *str) return (int64_t)(x + .499); } +static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const char *arg, int yes_to_set) +{ + if (yes_to_set) { + if (strcmp(arg, "yes") == 0 || strcmp(arg, "y") == 0) opt->flag |= flag; + else if (strcmp(arg, "no") == 0 || strcmp(arg, "n") == 0) opt->flag &= ~flag; + else fprintf(stderr, "[WARNING]\033[1;31m option '--%s' only accepts 'yes' or 'no'.\033[0m\n", long_options[long_idx].name); + } else { + if (strcmp(arg, "yes") == 0 || strcmp(arg, "y") == 0) opt->flag &= ~flag; + else if (strcmp(arg, "no") == 0 || strcmp(arg, "n") == 0) opt->flag |= flag; + else fprintf(stderr, "[WARNING]\033[1;31m option '--%s' only accepts 'yes' or 'no'.\033[0m\n", long_options[long_idx].name); + } +} + int main(int argc, char *argv[]) { - const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:"; mm_mapopt_t opt; mm_idxopt_t ipt; int i, c, n_threads = 3, long_idx; @@ -111,7 +126,9 @@ int main(int argc, char *argv[]) else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR; - else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF; + else if (c == 'D') opt.flag |= MM_F_NO_DIAG; + else if (c == 'P') opt.flag |= MM_F_ALL_CHAINS; + else if (c == 'X') opt.flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; // -D -P --no-long-join --dual=no else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR; else if (c == 'Q') opt.flag |= MM_F_NO_QUAL; else if (c == 'Y') opt.flag |= MM_F_SOFTCLIP; @@ -133,7 +150,6 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 2) opt.seed = atoi(optarg); // --seed else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname - else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self else if (c == 0 && long_idx == 6) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_SEED, n_threads = 1; // --print-seed else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len @@ -148,13 +164,9 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx ==22) opt.flag |= MM_F_FOR_ONLY; // --for-only else if (c == 0 && long_idx ==23) opt.flag |= MM_F_REV_ONLY; // --rev-only else if (c == 0 && long_idx == 14) { // --frag - if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) - opt.flag |= MM_F_FRAG_MODE; - else opt.flag &= ~MM_F_FRAG_MODE; + yes_or_no(&opt, MM_F_FRAG_MODE, long_idx, optarg, 1); } else if (c == 0 && long_idx == 15) { // --secondary - if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) - opt.flag &= ~MM_F_NO_PRINT_2ND; - else opt.flag |= MM_F_NO_PRINT_2ND; + yes_or_no(&opt, MM_F_NO_PRINT_2ND, long_idx, optarg, 0); } else if (c == 0 && long_idx == 16) { // --cs opt.flag |= MM_F_OUT_CS | MM_F_CIGAR; if (optarg == 0 || strcmp(optarg, "short") == 0) { @@ -167,13 +179,11 @@ int main(int argc, char *argv[]) fprintf(stderr, "[WARNING]\033[1;31m --cs only takes 'short' or 'long'. Invalid values are assumed to be 'short'.\033[0m\n"); } } else if (c == 0 && long_idx == 19) { // --splice-flank - if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) - opt.flag |= MM_F_SPLICE_FLANK; - else opt.flag &= ~MM_F_SPLICE_FLANK; + yes_or_no(&opt, MM_F_SPLICE_FLANK, long_idx, optarg, 1); } else if (c == 0 && long_idx == 24) { // --heap-sort - if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) - opt.flag |= MM_F_HEAP_SORT; - else opt.flag &= ~MM_F_HEAP_SORT; + yes_or_no(&opt, MM_F_HEAP_SORT, long_idx, optarg, 1); + } else if (c == 0 && long_idx == 26) { // --dual + yes_or_no(&opt, MM_F_NO_DUAL, long_idx, optarg, 0); } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) @@ -261,8 +271,8 @@ int main(int argc, char *argv[]) fprintf(fp_help, " map-ont: -k15 (Oxford Nanopore vs reference mapping)\n"); fprintf(fp_help, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n"); fprintf(fp_help, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); - fprintf(fp_help, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (PacBio read overlap)\n"); - fprintf(fp_help, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 --max-chain-skip 25 (ONT read overlap)\n"); + fprintf(fp_help, " ava-pb: -Hk19 -Xw5 -m100 -g10000 --max-chain-skip 25 (PacBio read overlap)\n"); + fprintf(fp_help, " ava-ont: -k15 -Xw5 -m100 -g10000 --max-chain-skip 25 (ONT read overlap)\n"); fprintf(fp_help, " splice: long-read spliced alignment (see minimap2.1 for details)\n"); fprintf(fp_help, " sr: short single-end reads without splicing (see minimap2.1 for details)\n"); fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); diff --git a/map.c b/map.c index 077c25c9..ad55cf83 100644 --- a/map.c +++ b/map.c @@ -116,15 +116,15 @@ static mm_match_t *collect_matches(void *km, int *_n_m, int max_occ, const mm_id static inline int skip_seed(int flag, uint64_t r, const mm_match_t *q, const char *qname, int qlen, const mm_idx_t *mi, int *is_self) { *is_self = 0; - if (qname && (flag & (MM_F_NO_SELF|MM_F_AVA))) { + if (qname && (flag & (MM_F_NO_DIAG|MM_F_NO_DUAL))) { const mm_idx_seq_t *s = &mi->seq[r>>32]; int cmp; cmp = strcmp(qname, s->name); - if ((flag&MM_F_NO_SELF) && cmp == 0 && s->len == qlen) { + if ((flag&MM_F_NO_DIAG) && cmp == 0 && s->len == qlen) { if ((uint32_t)r>>1 == (q->q_pos>>1)) return 1; // avoid the diagnonal anchors if ((r&1) == (q->q_pos&1)) *is_self = 1; // this flag is used to avoid spurious extension on self chain } - if ((flag&MM_F_AVA) && cmp > 0) // all-vs-all mode: map once + if ((flag&MM_F_NO_DUAL) && cmp > 0) // all-vs-all mode: map once return 1; } if (flag & (MM_F_FOR_ONLY|MM_F_REV_ONLY)) { @@ -237,11 +237,11 @@ static mm128_t *collect_seed_hits(void *km, const mm_mapopt_t *opt, int max_occ, static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_idx_t *mi, void *km, int qlen, int n_segs, const int *qlens, int *n_regs, mm_reg1_t *regs, mm128_t *a) { - if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap + if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b); if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, max_chain_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs); - if (!(opt->flag & MM_F_SPLICE) && !(opt->flag & MM_F_SR) && !(opt->flag & MM_F_NO_LJOIN)) + if (!(opt->flag & (MM_F_SPLICE|MM_F_SR|MM_F_NO_LJOIN))) // long join not working well without primary chains mm_join_long(km, opt, qlen, n_regs, regs, a); } } @@ -250,7 +250,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k { if (!(opt->flag & MM_F_CIGAR)) return regs; regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() - if (!(opt->flag & MM_F_AVA)) { + if (!(opt->flag & MM_F_ALL_CHAINS)) { // don't choose primary mapping(s) mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b); mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); mm_set_sam_pri(*n_regs, regs); diff --git a/minimap.h b/minimap.h index 04e32098..69073878 100644 --- a/minimap.h +++ b/minimap.h @@ -5,20 +5,20 @@ #include #include -#define MM_F_NO_SELF 0x001 -#define MM_F_AVA 0x002 -#define MM_F_CIGAR 0x004 -#define MM_F_OUT_SAM 0x008 -#define MM_F_NO_QUAL 0x010 -#define MM_F_OUT_CG 0x020 -#define MM_F_OUT_CS 0x040 -#define MM_F_SPLICE 0x080 // splice mode -#define MM_F_SPLICE_FOR 0x100 // match GT-AG -#define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG -#define MM_F_NO_LJOIN 0x400 -#define MM_F_OUT_CS_LONG 0x800 -#define MM_F_SR 0x1000 -#define MM_F_FRAG_MODE 0x2000 +#define MM_F_NO_DIAG 0x001 // no exact diagonal hit +#define MM_F_NO_DUAL 0x002 // skip pairs where query name is lexicographically larger than target name +#define MM_F_CIGAR 0x004 +#define MM_F_OUT_SAM 0x008 +#define MM_F_NO_QUAL 0x010 +#define MM_F_OUT_CG 0x020 +#define MM_F_OUT_CS 0x040 +#define MM_F_SPLICE 0x080 // splice mode +#define MM_F_SPLICE_FOR 0x100 // match GT-AG +#define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG +#define MM_F_NO_LJOIN 0x400 +#define MM_F_OUT_CS_LONG 0x800 +#define MM_F_SR 0x1000 +#define MM_F_FRAG_MODE 0x2000 #define MM_F_NO_PRINT_2ND 0x4000 #define MM_F_2_IO_THREADS 0x8000 #define MM_F_LONG_CIGAR 0x10000 @@ -28,6 +28,7 @@ #define MM_F_FOR_ONLY 0x100000 #define MM_F_REV_ONLY 0x200000 #define MM_F_HEAP_SORT 0x400000 +#define MM_F_ALL_CHAINS 0x800000 #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 diff --git a/minimap2.1 b/minimap2.1 index 7bc0e6c6..e8c1ad82 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "26 January 2018" "minimap2-2.7-dirty (r664)" "Bioinformatics tools" +.TH minimap2 1 "31 January 2018" "minimap2-2.8-dirty (r671)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -129,7 +129,7 @@ Ignore top fraction of most frequent minimizers [0.0002] .TP .BI -g \ INT -Stop chain enlongation if there are no minimizers in +Stop chain enlongation if there are no minimizers within .IR INT -bp [10000]. .TP @@ -148,11 +148,28 @@ Discard chains with chaining score [40]. Chaining score equals the approximate number of matching bases minus a concave gap penalty. It is computed with dynamic programming. .TP +.B -D +If query sequence name/length are identical to the target name/length, ignore +diagonal anchors. This option also reduces DP-based extension along the +diagonal. +.TP +.B -P +Retain all chains and don't attempt to set primary chains. Options +.B -p +and +.B -N +have no effect when this option is in use. +.TP +.BR --dual = yes | no +During chaining, whether to skip pairs wherein the query name is +lexicographically greater than the target name [yes] +.TP .B -X -Perform all-vs-all mapping. In this mode, if the query sequence name is -lexicographically larger than the target sequence name, the hits between them -will be suppressed; if the query sequence name is the same as the target name, -diagonal minimizer hits will also be suppressed. +Equivalent to +.RB ' -DP +.BR --dual = no +.BR --no-long-join '. +Primarily used for all-vs-all read overlapping. .TP .BI -p \ FLOAT Minimal secondary-to-primary score ratio to output secondary mappings [0.8]. @@ -162,6 +179,9 @@ the chain with a lower score is secondary to the chain with a higher score. If the ratio of the scores is below .IR FLOAT , the secondary chain will not be outputted or extended with DP alignment later. +This option has no effect when +.B -X +is applied. .TP .BI -N \ INT Output at most @@ -179,7 +199,7 @@ Increasing this option slows down spliced alignment. [200k] .TP .BI -F \ NUM Maximum fragment length (aka insert size; effective with -.BR -xsr / --frag) +.BR -xsr / --frag = yes ) [800] .TP .BI -M \ FLOAT @@ -209,7 +229,7 @@ applies a second round of chaining with a higher minimizer occurrence threshold if no good chain is found. In addition, minimap2 attempts to patch gaps between seeds with ungapped alignment. .TP -.BR --frag [= no | yes ] +.BR --frag = no | yes Whether to enable the fragment mode [no] .TP .B --for-only @@ -220,7 +240,7 @@ strand of the reference and the second read to the reverse stand. .B --rev-only Only map to the reverse complement strand of the reference sequences. .TP -.BR --heap-sort [= no | yes ] +.BR --heap-sort = no | yes If yes, sort anchors with heap merge, instead of radix sort. Heap merge is faster for short reads, but slower for long reads. [no] .SS Alignment options @@ -272,14 +292,13 @@ no attempt to match GT-AG [n] .BI --end-bonus \ INT Score bonus when alignment extends to the end of the query sequence [0]. .TP -.BR --splice-flank [= yes | no ] +.BR --splice-flank = yes | no Assume the next base to a .B GT donor site tends to be A/G (91% in human and 92% in mouse) and the preceding base to a .B AG -acceptor tends to be C/T [yes with -.BR --splice ]. +acceptor tends to be C/T [no]. This trend is evolutionarily conservative, all the way to S. cerevisiae (PMID:18688272). Specifying this option generally leads to higher junction accuracy by several percents, so it is applied by default with @@ -369,7 +388,7 @@ K/M/G/k/m/g suffix is accepted. A large helps load balancing in the multi-threading mode, at the cost of increased memory. .TP -.BR --secondary [= yes | no ] +.BR --secondary = yes | no Whether to output secondary alignments [yes] .TP .B --version @@ -416,13 +435,13 @@ Up to 10% sequence divergence. .B ava-pb PacBio all-vs-all overlap mapping .RB ( -Hk19 -.B -w5 -Xp0 -m100 -g10000 --max-chain-skip +.B -Xw5 -m100 -g10000 --max-chain-skip .BR 25 ). .TP .B ava-ont Oxford Nanopore all-vs-all overlap mapping .RB ( -k15 -.B -w5 -Xp0 -m100 -g10000 --max-chain-skip +.B -Xw5 -m100 -g10000 --max-chain-skip .BR 25 ). Similarly, the major difference from .B ava-pb @@ -444,8 +463,8 @@ tag ignores introns to demote hits to pseudogenes. .B sr Short single-end reads without splicing .RB ( -k21 -.B -w11 --sr --frag -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 -.B -s40 -g200 -2K50m --heap-sort +.B -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 +.B -s40 -g200 -2K50m --heap-sort=yes .BR --secondary=no ). .RE .SS Miscellaneous options @@ -539,8 +558,8 @@ where seed positions may be suboptimal. This should not be a big concern because even the optimal alignment may be wrong in such regions. .TP * -Minimap2 requires SSE2 instructions to compile. It is possible to add -non-SSE2 support, but it would make minimap2 slower by several times. +Minimap2 requires SSE2 or NEON instructions to compile. It is possible to add +non-SSE2/NEON support, but it would make minimap2 slower by several times. .SH SEE ALSO .PP miniasm(1), minimap(1), bwa(1). diff --git a/options.c b/options.c index bafdb5ae..4b67ab24 100644 --- a/options.c +++ b/options.c @@ -58,11 +58,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mm_mapopt_init(mo); } else if (strcmp(preset, "ava-ont") == 0) { io->flag = 0, io->k = 15, io->w = 5; - mo->flag |= MM_F_AVA | MM_F_NO_SELF; + mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; } else if (strcmp(preset, "ava-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19, io->w = 5; - mo->flag |= MM_F_AVA | MM_F_NO_SELF; + mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19;