Skip to content

Commit

Permalink
Merge pull request #73 from molgenis/beta
Browse files Browse the repository at this point in the history
Merge working Beta branch into Master for new release.
  • Loading branch information
MatthieuBeukers authored Oct 18, 2019
2 parents 2d3080f + a896f9e commit 90d59ac
Show file tree
Hide file tree
Showing 33 changed files with 5,725 additions and 752 deletions.
294 changes: 270 additions & 24 deletions DonorBamRead.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,88 +2,312 @@


class DonorBamRead:
# Saves the required BAM read data.
def __init__(self, readid, readpn, readchrom, readstart,
readlen, readseq, readquals, mapqual):
"""The DonorBamRead is used to save data from an aligned read extracted from a BAM or CRAM file.
DonorBamRead can be used to save the data from aligned reads that were extracted from a BAM or CRAM file.
Attributes
----------
bam_read_id : str
Read identifier
bam_read_flag : int
The read bitwise flag value
bam_read_pairnum : str
Read pair number (1 or 2)
bam_read_chrom : str
Chromosome name the read is located on
bam_read_ref_pos : int
Leftmost genomic position of the read
bam_read_length : int
Read length
bam_read_end_pos : int
Read rightmost position (determined by alignment)
bam_read_cigar : str
Read CIGAR string
bam_read_rnext : str
Chromosome name of the read mate
bam_read_pnext : int
Leftmost genomic position of the read mate
bam_read_tlen : int
Read TLEN value
bam_read_seq : str
Read sequence
bam_read_qual : str
Read ASCII quality
bam_read_map_qual : int
Read MAPQ value
"""

def __init__(self, readid, readflag, readpn, readchrom, readstart, readlen, readend, readcigar, readrnext,
readpnext, readtlen, readseq, readquals, readmapq):
"""Saves all required data of an aligned read from a BAM or CRAM file.
Parameters
----------
readid : str
The read identifier
readflag : str
Aligned read bitwise flag
readpn : str
The read pair number
readchrom : str
The chromosome name the read is located on
readstart : int
The leftmost genomic position of the mapped read
readlen : int
The total length of the read
readcigar : str
Aligned read CIGAR string
readrnext : str
Chromsome name the read mate is located on
readpnext : int
Leftmost genomic position of the read mate
readtlen : int
Aligned read TLEN value
readseq : str
The read sequence
reqdquals : str
The qualities of the read in ascii
mapqual : int
The MAPQ mapping quality
"""
self.bam_read_id = readid
self.bam_read_flag = readflag
self.bam_read_pairnum = readpn
self.bam_read_chrom = readchrom
self.bam_read_ref_pos = readstart
self.bam_read_length = readlen
self.bam_read_end_pos = readend
self.bam_read_cigar = readcigar
self.bam_read_rnext = readrnext
self.bam_read_pnext = readpnext
self.bam_read_tlen = readtlen
self.bam_read_seq = readseq
self.bam_read_qual = readquals
self.bam_read_map_qual = mapqual
self.bam_read_map_qual = readmapq

# ===METHODS TO GET SAVED DATA FROM THE DONORBAMREAD=======================
# Returns the BAM read identifier.
def get_bam_read_id(self):
"""Returns the identifier of the read.
Returns
-------
self.bam_read_id : str
The read identifier
"""
return self.bam_read_id

# Returns the BAM read pair number (1 or 2).
def get_bam_read_flag(self):
"""Returns the read alignment flag.
Returns
-------
self.bam_read_flag : str
Read alignment flag
"""

def get_bam_read_pair_number(self):
"""Returns the pair number of the read.
This is 1 for forward, or R1, reads and 2 for reverse, or R2, reads.
Returns
-------
self.bam_read_pairnum : str
The read pair number
"""
return self.bam_read_pairnum

# Returns the BAM read chromosome.
def get_bam_read_chrom(self):
"""Returns the name of the chromosome where the read has been mapped.
Returns
-------
self.bam_read_chrom : str
The chromosome name where the read is mapped
"""
return self.bam_read_chrom

# Returns the BAM read starting position on the reference sequence.
def get_bam_read_ref_pos(self):
"""Returns the genomic mapping position (the left most position) of the read.
Returns
-------
self.bam_read_ref_pos : int
The read leftmost genomic position
"""
return self.bam_read_ref_pos

# Returns the BAM read length.
def get_bam_read_length(self):
"""Returns the length of the read. In VaSeBuilder this length is calculated by means of the CIGAR string by
using the pysam method infer_read_length()
Returns
-------
self.bam_read_length : int
The length of the read
"""
return self.bam_read_length

# Returns the BAM read ending position on the reference (calculated
# as starting position + the length of the read).
def get_bam_read_ref_end(self):
if self.bam_read_length is not None:
return self.bam_read_ref_pos + self.bam_read_length
return -1
"""Returns the rightmost genomic position of the read.
Returns
-------
self.bam_read_end_pos : int
The rightmost position of the read
"""
return self.bam_read_end_pos

def get_bam_read_cigar(self):
"""Returns the read alignment CIGAR string
Returns
-------
self.bam_read_cigar : str
Read alignment CIGAR string
"""
return self.bam_read_cigar

def get_bam_read_rnext(self):
"""Returns the chromosome name the read mate is located on.
Returns
-------
self.bam_read_rnext : str
Chromosome name the read mate is located on
"""
return self.bam_read_rnext

def get_bam_read_pnext(self):
"""Returns the leftmost genomic position of the read mate.
Returns
-------
self.bam_read_pnext : ibnt
Leftmost genomic position of the read mate
"""
return self.bam_read_pnext

def get_bam_read_tlen(self):
"""Returns the read alignment TLEN value.
Returns
-------
self.bam_read_tlen : int
Read alignment TLEN value
"""

# Returns the BAM read sequence.
def get_bam_read_sequence(self):
"""Returns the read sequence as a String.
Returns
-------
self.bam_read_seq : str
The sequence of the read
"""
return self.bam_read_seq

# Returns the BAM read quality scores.
def get_bam_read_qual(self):
"""Returns the read quality.
Returns
-------
self.bam_read_qual : str
Read quality
"""
return self.bam_read_qual

# Returns the BAM read quality as an array of Q-Scores.
def get_bam_read_q_scores(self):
"""Converts the String of ASCII quality scores to a list Q-Scores. The Q-Score for each ASCII quality character
are calculated by obtaining the unicode code point and subtracting 33.
Returns
-------
qscores : list of int
Q-Score of the read
"""
qscores = []
for qualSymbol in self.bam_read_qual:
qscores.append(ord(qualSymbol)-33)
return qscores

# Returns the maping quality of the BAM read.
def get_mapping_qual(self):
"""Returns the mapping quality that was assigned to the read.
Returns
-------
self.bam_read_map_qual : int
The mapping quality (MAPQ)
"""
return self.bam_read_map_qual

def read_has_hardclipped_bases(self):
"""Returns whether the read has hardclipped bases.
The CIGAR string of the read is checked whether it contains an 'H' (Hardclipped bases).
Returns
-------
bool
True if read CIGAR contains 'H', False if not
"""
if self.bam_read_cigar is not None:
return "H" in self.bam_read_cigar
return False

# ===METHOD TO GET STATISTICS DATA FROM THE DONORBAMREAD===================
# Returns the average Q-Score.
def get_average_qscore(self):
"""Calculates and returns the mean Q-Score of the read.
Returns
-------
float
The mean Q-Score
"""
qscores = self.get_bam_read_q_scores()
return statistics.mean(qscores)

# Returns the median Q-Score.
def get_median_qscore(self):
"""Calculates and returns the median Q-Score of the read.
Returns
-------
int
The median Q-Score of the read
"""
qscores = self.get_bam_read_q_scores()
return statistics.median(qscores)

# ===METHODS TO CHECK WHETHER THE BAM READ IS R1 OR R2=====================
# Returns if the BAM read is the first (forward) read.
def is_read1(self):
"""Returns whether the read is the forward/R1 read by checking if the pair number is set to '1'.
Returns
-------
bool
True if the read has pair number 1, otherwise False
"""
return self.bam_read_pairnum == "1"

# Returns if the BAM read is the second (reverse) read.
def is_read2(self):
"""Returns whether the read is the reverse/R2 read by checking if the pair number is set to '2'.
Returns
-------
bool
True if the read has pair number 2, otherwise False
"""
return self.bam_read_pairnum == "2"

# ===METHODS TO RETURN A STRING REPRESENTATION OF THE DONORBAMREAD OBJECT==
# Returns a String representation.
def to_string(self):
"""Returns a String with all the saved data, separated by tabs, of the read.
Returns
-------
str
String representation of the read.
"""
return (str(self.bam_read_id) + "\t"
+ str(self.bam_read_pairnum) + "\t"
+ str(self.bam_read_chrom) + "\t"
Expand All @@ -93,8 +317,17 @@ def to_string(self):
+ str(self.bam_read_qual) + "\t"
+ str(self.bam_read_map_qual))

# Returns the BAM read as a fastq sequence.
def get_as_fastq_seq(self, addpairnum=False):
"""Returns the read as a FastQ entry.
If the 'addpairnum' argument is set to True the read pair number will be added at the end of the read
identifier as '/1' or '/2', depending on whether the read is the forward/R1 or reverse/R2 read.
Returns
-------
str
The read as FastQ entry
"""
if addpairnum:
return ("@" + str(self.bam_read_id) + "/" + str(self.bam_read_pairnum) + "\n"
+ str(self.bam_read_seq) + "\n"
Expand All @@ -105,3 +338,16 @@ def get_as_fastq_seq(self, addpairnum=False):
+ str(self.bam_read_seq) + "\n"
+ "+\n"
+ str(self.bam_read_qual) + "\n")

def get_as_bam_read(self):
"""Returns the read as a SAM/BAM/CRAM file entry.
Returns
-------
bamcram_entry : str
Read as SAM/BAM/CRAM entry
"""
bamcram_entry = f"{self.bam_read_id}\t{self.bam_read_flag}\t{self.bam_read_chrom}\t{self.bam_read_ref_pos}" \
f"\t{self.bam_read_map_qual}t{self.bam_read_cigar}\t{self.bam_read_rnext}\t{self.bam_read_pnext}" \
f"\t{self.bam_read_tlen}\t{self.bam_read_seq}\t{self.bam_read_qual}"
return bamcram_entry
Loading

0 comments on commit 90d59ac

Please sign in to comment.