Skip to content

Commit

Permalink
r670: added PE support to mappy
Browse files Browse the repository at this point in the history
and minor code cleanup
  • Loading branch information
lh3 committed Jan 31, 2018
1 parent 12a5a5f commit 46d6349
Show file tree
Hide file tree
Showing 9 changed files with 71 additions and 23 deletions.
4 changes: 3 additions & 1 deletion hit.c
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,10 @@ mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int
for (s = 0; s < n_segs; ++s) {
regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a);
n_regs[s] = seg[s].n_u;
for (i = 0; i < n_regs[s]; ++i)
for (i = 0; i < n_regs[s]; ++i) {
regs[s][i].seg_split = 1;
regs[s][i].seg_id = s;
}
}
return seg;
}
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"

#define MM_VERSION "2.7-r669-dirty"
#define MM_VERSION "2.7-r670-dirty"

#ifdef __linux__
#include <sys/resource.h>
Expand Down
20 changes: 9 additions & 11 deletions map.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_i
}
}

static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, const char *qual, int *n_regs, mm_reg1_t *regs, mm128_t *a)
static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, int *n_regs, mm_reg1_t *regs, mm128_t *a)
{
if (!(opt->flag & MM_F_CIGAR)) return regs;
regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
Expand All @@ -258,7 +258,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
return regs;
}

void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, const char **quals, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
int i, j, rep_len, qlen_sum, n_regs0, n_mini_pos;
int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR);
Expand Down Expand Up @@ -339,7 +339,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
if (!is_sr) mm_est_err(mi, qlen_sum, n_regs0, regs0, a, n_mini_pos, mini_pos);

if (n_segs == 1) { // uni-segment
regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], quals? quals[0] : 0, &n_regs0, regs0, a);
regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], &n_regs0, regs0, a);
mm_set_mapq(n_regs0, regs0, opt->min_chain_score, opt->a, rep_len, is_sr);
n_regs[0] = n_regs0, regs[0] = regs0;
} else { // multi-segment
Expand All @@ -348,7 +348,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
free(regs0);
for (i = 0; i < n_segs; ++i) {
mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b); // update mm_reg1_t::parent
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], quals? quals[i] : 0, &n_regs[i], regs[i], seg[i].a);
regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a);
mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len, is_sr);
}
mm_seg_free(b->km, n_segs, seg);
Expand Down Expand Up @@ -376,7 +376,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
mm_reg1_t *regs;
mm_map_frag(mi, 1, &qlen, &seq, 0, n_regs, &regs, b, opt, qname);
mm_map_frag(mi, 1, &qlen, &seq, n_regs, &regs, b, opt, qname);
return regs;
}

Expand Down Expand Up @@ -404,25 +404,23 @@ typedef struct {
static void worker_for(void *_data, long i, int tid) // kt_for() callback
{
step_t *s = (step_t*)_data;
int qlens[MM_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori, is_sr = !!(s->p->opt->flag & MM_F_SR);
const char *qseqs[MM_MAX_SEG], *quals[MM_MAX_SEG];
int qlens[MM_MAX_SEG], j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori;
const char *qseqs[MM_MAX_SEG];
mm_tbuf_t *b = s->buf[tid];
assert(s->n_seg[i] <= MM_MAX_SEG);
memset(quals, 0, sizeof(char*) * MM_MAX_SEG);
if (mm_dbg_flag & MM_DBG_PRINT_QNAME)
fprintf(stderr, "QR\t%s\t%d\t%d\n", s->seq[off].name, tid, s->seq[off].l_seq);
for (j = 0; j < s->n_seg[i]; ++j) {
if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1))))
mm_revcomp_bseq(&s->seq[off + j]);
qlens[j] = s->seq[off + j].l_seq;
qseqs[j] = s->seq[off + j].seq;
quals[j] = is_sr? s->seq[off + j].qual : 0;
}
if (s->p->opt->flag & MM_F_INDEPEND_SEG) {
for (j = 0; j < s->n_seg[i]; ++j)
mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &quals[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name);
mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name);
} else {
mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, quals, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
}
for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
Expand Down
4 changes: 3 additions & 1 deletion minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ typedef struct {
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
int32_t n_sub; // number of suboptimal mappings
int32_t score0; // initial chaining score (before chain merging/spliting)
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, dummy:16;
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, dummy:8;
uint32_t hash;
float div;
mm_extra_t *p;
Expand Down Expand Up @@ -279,6 +279,8 @@ void mm_tbuf_destroy(mm_tbuf_t *b);
*/
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);

void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);

/**
* Align a fasta/fastq file and print alignments to stdout
*
Expand Down
10 changes: 8 additions & 2 deletions python/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,13 @@ This constructor accepts the following arguments:

.. code:: python
mappy.Aligner.map(seq)
mappy.Aligner.map(seq, seq2=None)
This method aligns :code:`seq` against the index. It is a generator, *yielding*
a series of :code:`mappy.Alignment` objects.
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
performs paired-end alignment, assuming the two ends are in the FR orientation.
Alignments of the two ends can be distinguished by the :code:`read_num` field
(see below).

Class mappy.Alignment
~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -118,6 +121,9 @@ properties:
* **is_primary**: if the alignment is primary (typically the best and the first
to generate)

* **read_num**: read number that the alignment corresponds to; 1 for the first
read and 2 for the second read

* **cigar_str**: CIGAR string

* **cigar**: CIGAR returned as an array of shape :code:`(n_cigar,2)`. The two
Expand Down
33 changes: 33 additions & 0 deletions python/cmappy.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ typedef struct {
int32_t blen, mlen, NM, ctg_len;
uint8_t mapq, is_primary;
int8_t strand, trans_strand;
int32_t seg_id;
int32_t n_cigar32;
uint32_t *cigar32;
} mm_hitpy_t;
Expand All @@ -32,6 +33,7 @@ static inline void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
h->NM = r->blen - r->mlen + r->p->n_ambi;
h->trans_strand = r->p->trans_strand == 1? 1 : r->p->trans_strand == 2? -1 : 0;
h->is_primary = (r->id == r->parent);
h->seg_id = r->seg_id;
h->n_cigar32 = r->p->n_cigar;
h->cigar32 = r->p->cigar;
}
Expand Down Expand Up @@ -68,4 +70,35 @@ static inline void mm_reset_timer(void)
mm_realtime0 = realtime();
}

extern unsigned char seq_comp_table[256];
static inline mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
{
if (seq2 == 0) {
return mm_map(mi, strlen(seq1), seq1, n_regs, b, opt, NULL);
} else {
int _n_regs[2];
mm_reg1_t *regs[2];
char *seq[2];
int i, len[2];
len[0] = strlen(seq1);
len[1] = strlen(seq2);
seq[0] = (char*)seq1;
seq[1] = strdup(seq2);
for (i = 0; i < len[1]>>1; ++i) {
int t = seq[1][len[1] - i - 1];
seq[1][len[1] - i - 1] = seq_comp_table[(uint8_t)seq[1][i]];
seq[1][i] = seq_comp_table[t];
}
if (len[1]&1) seq[1][len[1]>>1] = seq_comp_table[(uint8_t)seq[1][len[1]>>1]];
mm_map_frag(mi, 2, len, (const char**)seq, _n_regs, regs, b, opt, NULL);
for (i = 0; i < _n_regs[1]; ++i)
regs[1][i].rev = !regs[1][i].rev;
*n_regs = _n_regs[0] + _n_regs[1];
regs[0] = (mm_reg1_t*)realloc(regs[0], sizeof(mm_reg1_t) * (*n_regs));
memcpy(&regs[0][_n_regs[0]], regs[1], _n_regs[1] * sizeof(mm_reg1_t));
free(regs[1]);
return regs[0];
}
}

#endif
3 changes: 2 additions & 1 deletion python/cmappy.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ cdef extern from "minimap.h":

mm_tbuf_t *mm_tbuf_init()
void mm_tbuf_destroy(mm_tbuf_t *b)
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name)

#
# Helper header (because it is hard to expose mm_reg1_t with Cython)
Expand All @@ -92,11 +91,13 @@ cdef extern from "cmappy.h":
int32_t blen, mlen, NM, ctg_len
uint8_t mapq, is_primary
int8_t strand, trans_strand
int32_t seg_id
int32_t n_cigar32
uint32_t *cigar32

void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
void mm_free_reg1(mm_reg1_t *r)
mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)

ctypedef struct kstring_t:
unsigned l, m
Expand Down
14 changes: 10 additions & 4 deletions python/mappy.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ cdef class Alignment:
cdef int _NM, _mlen, _blen
cdef int8_t _strand, _trans_strand
cdef uint8_t _mapq, _is_primary
cdef int _seg_id
cdef _ctg, _cigar # these are python objects

def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand):
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id):
self._ctg = ctg if isinstance(ctg, str) else ctg.decode()
self._ctg_len, self._r_st, self._r_en = cl, cs, ce
self._strand, self._q_st, self._q_en = strand, qs, qe
Expand All @@ -21,6 +22,7 @@ cdef class Alignment:
self._cigar = cigar
self._is_primary = is_primary
self._trans_strand = trans_strand
self._seg_id = seg_id

@property
def ctg(self): return self._ctg
Expand Down Expand Up @@ -64,6 +66,9 @@ cdef class Alignment:
@property
def cigar(self): return self._cigar

@property
def read_num(self): return self._seg_id + 1

@property
def cigar_str(self):
return "".join(map(lambda x: str(x[0]) + 'MIDNSH'[x[1]], self._cigar))
Expand Down Expand Up @@ -125,7 +130,7 @@ cdef class Aligner:
def __bool__(self):
return (self._idx != NULL)

def map(self, seq, buf=None):
def map(self, seq, seq2=None, buf=None):
cdef cmappy.mm_reg1_t *regs
cdef cmappy.mm_hitpy_t h
cdef ThreadBuffer b
Expand All @@ -134,15 +139,16 @@ cdef class Aligner:
if self._idx is NULL: return None
if buf is None: b = ThreadBuffer()
else: b = buf
regs = cmappy.mm_map(self._idx, len(seq), str.encode(seq), &n_regs, b._b, &self.map_opt, NULL)
if seq2 is None: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), NULL, &n_regs, b._b, &self.map_opt)
else: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), str.encode(seq2), &n_regs, b._b, &self.map_opt)

for i in range(n_regs):
cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
cigar = []
for k in range(h.n_cigar32):
c = h.cigar32[k]
cigar.append([c>>4, c&0xf])
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand)
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id)
cmappy.mm_free_reg1(&regs[i])
free(regs)

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def readme():

setup(
name = 'mappy',
version = '2.7',
version = '2.8',
url = 'https://github.com/lh3/minimap2',
description = 'Minimap2 python binding',
long_description = readme(),
Expand All @@ -39,7 +39,7 @@ def readme():
depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',
'ksw2.h', 'kthread.h', 'kvec.h', 'mmpriv.h', 'sdust.h',
'python/cmappy.h', 'python/cmappy.pxd'],
extra_compile_args = ['-msse4'], # WARNING: ancient x86_64 CPUs don't have SSE4
extra_compile_args = ['-DHAVE_KALLOC', '-msse4'], # WARNING: ancient x86_64 CPUs don't have SSE4
include_dirs = ['.'],
libraries = ['z', 'm', 'pthread'])],
classifiers = [
Expand Down

0 comments on commit 46d6349

Please sign in to comment.