From 46d6349af4d3df895e321c8cf11a7c758f0a498a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 31 Jan 2018 11:33:08 -0500 Subject: [PATCH] r670: added PE support to mappy and minor code cleanup --- hit.c | 4 +++- main.c | 2 +- map.c | 20 +++++++++----------- minimap.h | 4 +++- python/README.rst | 10 ++++++++-- python/cmappy.h | 33 +++++++++++++++++++++++++++++++++ python/cmappy.pxd | 3 ++- python/mappy.pyx | 14 ++++++++++---- setup.py | 4 ++-- 9 files changed, 71 insertions(+), 23 deletions(-) diff --git a/hit.c b/hit.c index b30337d3..c3256b46 100644 --- a/hit.c +++ b/hit.c @@ -390,8 +390,10 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int for (s = 0; s < n_segs; ++s) { regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a); n_regs[s] = seg[s].n_u; - for (i = 0; i < n_regs[s]; ++i) + for (i = 0; i < n_regs[s]; ++i) { regs[s][i].seg_split = 1; + regs[s][i].seg_id = s; + } } return seg; } diff --git a/main.c b/main.c index 6436a04e..34b49e0c 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.7-r669-dirty" +#define MM_VERSION "2.7-r670-dirty" #ifdef __linux__ #include diff --git a/map.c b/map.c index d5b7dd33..077c25c9 100644 --- a/map.c +++ b/map.c @@ -246,7 +246,7 @@ static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_i } } -static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, const char *qual, int *n_regs, mm_reg1_t *regs, mm128_t *a) +static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, int *n_regs, mm_reg1_t *regs, mm128_t *a) { if (!(opt->flag & MM_F_CIGAR)) return regs; regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() @@ -258,7 +258,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k return regs; } -void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, const char **quals, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) +void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { int i, j, rep_len, qlen_sum, n_regs0, n_mini_pos; int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR); @@ -339,7 +339,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (!is_sr) mm_est_err(mi, qlen_sum, n_regs0, regs0, a, n_mini_pos, mini_pos); if (n_segs == 1) { // uni-segment - regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], quals? quals[0] : 0, &n_regs0, regs0, a); + regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], &n_regs0, regs0, a); mm_set_mapq(n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr); n_regs[0] = n_regs0, regs[0] = regs0; } else { // multi-segment @@ -348,7 +348,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** free(regs0); for (i = 0; i < n_segs; ++i) { mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b); // update mm_reg1_t::parent - regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], quals? quals[i] : 0, &n_regs[i], regs[i], seg[i].a); + regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr); } mm_seg_free(b->km, n_segs, seg); @@ -376,7 +376,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { mm_reg1_t *regs; - mm_map_frag(mi, 1, &qlen, &seq, 0, n_regs, ®s, b, opt, qname); + mm_map_frag(mi, 1, &qlen, &seq, n_regs, ®s, b, opt, qname); return regs; } @@ -404,11 +404,10 @@ typedef struct { static void worker_for(void *_data, long i, int tid) // kt_for() callback { step_t *s = (step_t*)_data; - int qlens[MM_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori, is_sr = !!(s->p->opt->flag & MM_F_SR); - const char *qseqs[MM_MAX_SEG], *quals[MM_MAX_SEG]; + int qlens[MM_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori; + const char *qseqs[MM_MAX_SEG]; mm_tbuf_t *b = s->buf[tid]; assert(s->n_seg[i] <= MM_MAX_SEG); - memset(quals, 0, sizeof(char*) * MM_MAX_SEG); if (mm_dbg_flag & MM_DBG_PRINT_QNAME) fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq); for (j = 0; j < s->n_seg[i]; ++j) { @@ -416,13 +415,12 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback mm_revcomp_bseq(&s->seq[off + j]); qlens[j] = s->seq[off + j].l_seq; qseqs[j] = s->seq[off + j].seq; - quals[j] = is_sr? s->seq[off + j].qual : 0; } if (s->p->opt->flag & MM_F_INDEPEND_SEG) { for (j = 0; j < s->n_seg[i]; ++j) - mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &quals[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name); + mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name); } else { - mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, quals, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name); + mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name); } for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) { diff --git a/minimap.h b/minimap.h index b244639f..04e32098 100644 --- a/minimap.h +++ b/minimap.h @@ -81,7 +81,7 @@ typedef struct { int32_t mlen, blen; // seeded exact match length; seeded alignment block length int32_t n_sub; // number of suboptimal mappings int32_t score0; // initial chaining score (before chain merging/spliting) - uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, dummy:16; + uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, dummy:8; uint32_t hash; float div; mm_extra_t *p; @@ -279,6 +279,8 @@ void mm_tbuf_destroy(mm_tbuf_t *b); */ mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name); +void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname); + /** * Align a fasta/fastq file and print alignments to stdout * diff --git a/python/README.rst b/python/README.rst index bfee564f..379eef5e 100644 --- a/python/README.rst +++ b/python/README.rst @@ -81,10 +81,13 @@ This constructor accepts the following arguments: .. code:: python - mappy.Aligner.map(seq) + mappy.Aligner.map(seq, seq2=None) This method aligns :code:`seq` against the index. It is a generator, *yielding* -a series of :code:`mappy.Alignment` objects. +a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy +performs paired-end alignment, assuming the two ends are in the FR orientation. +Alignments of the two ends can be distinguished by the :code:`read_num` field +(see below). Class mappy.Alignment ~~~~~~~~~~~~~~~~~~~~~ @@ -118,6 +121,9 @@ properties: * **is_primary**: if the alignment is primary (typically the best and the first to generate) +* **read_num**: read number that the alignment corresponds to; 1 for the first + read and 2 for the second read + * **cigar_str**: CIGAR string * **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two diff --git a/python/cmappy.h b/python/cmappy.h index f6910350..dc9d509b 100644 --- a/python/cmappy.h +++ b/python/cmappy.h @@ -15,6 +15,7 @@ typedef struct { int32_t blen, mlen, NM, ctg_len; uint8_t mapq, is_primary; int8_t strand, trans_strand; + int32_t seg_id; int32_t n_cigar32; uint32_t *cigar32; } mm_hitpy_t; @@ -32,6 +33,7 @@ static inline void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h) h->NM = r->blen - r->mlen + r->p->n_ambi; h->trans_strand = r->p->trans_strand == 1? 1 : r->p->trans_strand == 2? -1 : 0; h->is_primary = (r->id == r->parent); + h->seg_id = r->seg_id; h->n_cigar32 = r->p->n_cigar; h->cigar32 = r->p->cigar; } @@ -68,4 +70,35 @@ static inline void mm_reset_timer(void) mm_realtime0 = realtime(); } +extern unsigned char seq_comp_table[256]; +static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt) +{ + if (seq2 == 0) { + return mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, NULL); + } else { + int _n_regs[2]; + mm_reg1_t *regs[2]; + char *seq[2]; + int i, len[2]; + len[0] = strlen(seq1); + len[1] = strlen(seq2); + seq[0] = (char*)seq1; + seq[1] = strdup(seq2); + for (i = 0; i < len[1]>>1; ++i) { + int t = seq[1][len[1] - i - 1]; + seq[1][len[1] - i - 1] = seq_comp_table[(uint8_t)seq[1][i]]; + seq[1][i] = seq_comp_table[t]; + } + if (len[1]&1) seq[1][len[1]>>1] = seq_comp_table[(uint8_t)seq[1][len[1]>>1]]; + mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, NULL); + for (i = 0; i < _n_regs[1]; ++i) + regs[1][i].rev = !regs[1][i].rev; + *n_regs = _n_regs[0] + _n_regs[1]; + regs[0] = (mm_reg1_t*)realloc(regs[0], sizeof(mm_reg1_t) * (*n_regs)); + memcpy(®s[0][_n_regs[0]], regs[1], _n_regs[1] * sizeof(mm_reg1_t)); + free(regs[1]); + return regs[0]; + } +} + #endif diff --git a/python/cmappy.pxd b/python/cmappy.pxd index b4f965d5..6d1b9318 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -79,7 +79,6 @@ cdef extern from "minimap.h": mm_tbuf_t *mm_tbuf_init() void mm_tbuf_destroy(mm_tbuf_t *b) - mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name) # # Helper header (because it is hard to expose mm_reg1_t with Cython) @@ -92,11 +91,13 @@ cdef extern from "cmappy.h": int32_t blen, mlen, NM, ctg_len uint8_t mapq, is_primary int8_t strand, trans_strand + int32_t seg_id int32_t n_cigar32 uint32_t *cigar32 void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h) void mm_free_reg1(mm_reg1_t *r) + mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt) ctypedef struct kstring_t: unsigned l, m diff --git a/python/mappy.pyx b/python/mappy.pyx index 1f17dc0e..6b0e7537 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -10,9 +10,10 @@ cdef class Alignment: cdef int _NM, _mlen, _blen cdef int8_t _strand, _trans_strand cdef uint8_t _mapq, _is_primary + cdef int _seg_id cdef _ctg, _cigar # these are python objects - def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand): + def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id): self._ctg = ctg if isinstance(ctg, str) else ctg.decode() self._ctg_len, self._r_st, self._r_en = cl, cs, ce self._strand, self._q_st, self._q_en = strand, qs, qe @@ -21,6 +22,7 @@ cdef class Alignment: self._cigar = cigar self._is_primary = is_primary self._trans_strand = trans_strand + self._seg_id = seg_id @property def ctg(self): return self._ctg @@ -64,6 +66,9 @@ cdef class Alignment: @property def cigar(self): return self._cigar + @property + def read_num(self): return self._seg_id + 1 + @property def cigar_str(self): return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar)) @@ -125,7 +130,7 @@ cdef class Aligner: def __bool__(self): return (self._idx != NULL) - def map(self, seq, buf=None): + def map(self, seq, seq2=None, buf=None): cdef cmappy.mm_reg1_t *regs cdef cmappy.mm_hitpy_t h cdef ThreadBuffer b @@ -134,7 +139,8 @@ cdef class Aligner: if self._idx is NULL: return None if buf is None: b = ThreadBuffer() else: b = buf - regs = cmappy.mm_map(self._idx, len(seq), str.encode(seq), &n_regs, b._b, &self.map_opt, NULL) + if seq2 is None: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), NULL, &n_regs, b._b, &self.map_opt) + else: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), str.encode(seq2), &n_regs, b._b, &self.map_opt) for i in range(n_regs): cmappy.mm_reg2hitpy(self._idx, ®s[i], &h) @@ -142,7 +148,7 @@ cdef class Aligner: for k in range(h.n_cigar32): c = h.cigar32[k] cigar.append([c>>4, c&0xf]) - yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand) + yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id) cmappy.mm_free_reg1(®s[i]) free(regs) diff --git a/setup.py b/setup.py index 9ed55565..01660972 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def readme(): setup( name = 'mappy', - version = '2.7', + version = '2.8', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(), @@ -39,7 +39,7 @@ def readme(): depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h', 'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h', 'python/cmappy.h', 'python/cmappy.pxd'], - extra_compile_args = ['-msse4'], # WARNING: ancient x86_64 CPUs don't have SSE4 + extra_compile_args = ['-DHAVE_KALLOC', '-msse4'], # WARNING: ancient x86_64 CPUs don't have SSE4 include_dirs = ['.'], libraries = ['z', 'm', 'pthread'])], classifiers = [