Skip to content

Commit

Permalink
Update and bug fix
Browse files Browse the repository at this point in the history
Update: PoGo is now compatible with Ensembl and Gencode annotations.
Bug fix: reverse-strand protein-coding features do not require CDS entries to be ordered in the direction of translation; entries sorted in ascending coordinate order are accepted as well.
  • Loading branch information
cschlaffner committed Sep 16, 2019
1 parent 913a91a commit fd39823
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 112 deletions.
32 changes: 25 additions & 7 deletions PoGo/src/Coordinates.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,11 @@ struct GenomeCoordinates : Coordinates {
return true;
}
if (lhs.chr.getValue() == rhs.chr.getValue()) {
return lhs.start < rhs.start && lhs.end < rhs.end && lhs.end >= rhs.start;
if ((lhs.strand == Strand::rev && rhs.strand == Strand::rev) || (lhs.strand == Strand::unk && rhs.strand == Strand::rev) || (lhs.strand == Strand::rev && rhs.strand == Strand::unk)) {
return lhs.start > rhs.start && lhs.end > rhs.end && lhs.start >= rhs.end;
} else {
return lhs.start < rhs.start && lhs.end < rhs.end && lhs.end <= rhs.start;
}
}
return lhs.chr.getValue() < rhs.chr.getValue();
}
Expand All @@ -102,10 +106,17 @@ struct GenomeCoordinates : Coordinates {

bool operator<(const GenomeCoordinates& rhs) const {
if (chr.isScaffold() && rhs.chr.isScaffold() && chrscaf == rhs.chrscaf) {
if (start == rhs.start) {
return end < rhs.end;
if ((strand == Strand::rev && rhs.strand == Strand::rev) || (strand == Strand::unk && rhs.strand == Strand::rev) || (strand == Strand::rev && rhs.strand == Strand::unk)) {
if (end == rhs.end) {
return start > rhs.start;
}
return end > rhs.end;
} else {
if (start == rhs.start) {
return end < rhs.end;
}
return start < rhs.start;
}
return start < rhs.start;
}
if (chr.isScaffold() && rhs.chr.isScaffold() && chrscaf != rhs.chrscaf) {
return chrscaf < rhs.chrscaf;
Expand All @@ -117,10 +128,17 @@ struct GenomeCoordinates : Coordinates {
return true;
}
if (chr.getValue() == rhs.chr.getValue()) {
if (start == rhs.start) {
return end < rhs.end;
if ((strand == Strand::rev && rhs.strand == Strand::rev) || (strand == Strand::unk && rhs.strand == Strand::rev) || (strand == Strand::rev && rhs.strand == Strand::unk)) {
if (end == rhs.end) {
return start > rhs.start;
}
return end > rhs.end;
} else {
if (start == rhs.start) {
return end < rhs.end;
}
return start < rhs.start;
}
return start < rhs.start;
}
return chr.getValue() < rhs.chr.getValue();
}
Expand Down
222 changes: 133 additions & 89 deletions PoGo/src/GTFParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ bool GTFParser::is_first_strand(std::vector<std::string> const& tokens) {
return tokens.at(6).compare("+") == 0;
}

// Returns true when the given strand token is exactly "+".
bool GTFParser::is_first_strand(std::string const& token) {
	const bool is_plus = (token == "+");
	return is_plus;
}

// Returns true only for Strand::fwd; every other Strand value yields false.
bool GTFParser::is_first_strand(Strand const& token) {
	const bool is_forward = (Strand::fwd == token);
	return is_forward;
}

// Returns true when the token at index 2 of a tokenized GTF line is "CDS".
// Uses at(), so a line with fewer than three tokens throws std::out_of_range.
bool GTFParser::is_cds(std::vector<std::string> const& tokens) {
	std::string const& feature = tokens.at(2);
	return feature == "CDS";
}
Expand All @@ -50,6 +58,121 @@ bool GTFParser::is_next_gene(std::vector<std::string> const& tokens) {
return tokens.at(2).compare("gene") == 0;
}

// Sorts the collected CDS segments with the GenomeCoordinates comparison
// functor and then maps them one after another onto protein coordinates,
// inserting every (protein, genome) coordinate pair into coordinates_map.
// Note that CDS_coords is reordered in place by the sort.
void GTFParser::protein_exons_combine(CoordinateMapType & coordinates_map, std::list<GenomeCoordinates> & CDS_coords) {
	CDS_coords.sort(GenomeCoordinates());

	// Scratch value that the per-segment overload overwrites on every call.
	Coordinates current = Coordinates();

	// Seed for the first segment: nothing precedes it, so the "previous"
	// coordinates sit at 0 with both terminal offsets set to off3.
	Coordinates previous = Coordinates();
	previous.start = 0;
	previous.end = 0;
	previous.Nterm = off3;
	previous.Cterm = off3;

	for (GenomeCoordinates & segment : CDS_coords) {
		protein_exons_combine(current, previous, segment, coordinates_map);
	}
}

// Maps one CDS segment (genCoord) onto protein coordinates.
//
// protein_coordinates is reset and then filled with the terminal offsets and
// the start/end positions derived from genCoord and from
// prev_proteint_coordinates (the result of the preceding segment). The
// finished pair (protein_coordinates, genCoord) is inserted into
// coordinates_map and prev_proteint_coordinates is overwritten with the new
// result so that the next call can continue from it.
void GTFParser::protein_exons_combine(Coordinates & protein_coordinates, Coordinates & prev_proteint_coordinates, GenomeCoordinates & genCoord, CoordinateMapType & coordinates_map) {
	protein_coordinates = Coordinates();

	// N-terminal offset: use the segment's own frame when it is known,
	// otherwise derive it from the C-terminal offset of the previous segment.
	if (genCoord.frame != unknown) {
		protein_coordinates.Nterm = Offset(int(genCoord.frame));
	} else if (prev_proteint_coordinates.Cterm != off3) {
		protein_coordinates.Nterm = Offset(3 - prev_proteint_coordinates.Cterm);
	} else {
		protein_coordinates.Nterm = off3;
	}

	// Only Strand::fwd counts as the first strand; any other value takes the
	// start-minus-end branch below.
	const bool on_first_strand = is_first_strand(genCoord.strand);

	int length = 0;
	if (on_first_strand) {
		length = genCoord.end - genCoord.start + 1;
	} else {
		length = genCoord.start - genCoord.end + 1;
	}

	// C-terminal offset, selected by the segment length modulo 3 and the
	// N-terminal offset. Combinations not listed leave Cterm untouched.
	switch (length % 3) {
	case 0:
		if (protein_coordinates.Nterm != off3) {
			protein_coordinates.Cterm = Offset(3 - protein_coordinates.Nterm);
		} else {
			protein_coordinates.Cterm = off3;
		}
		break;
	case 2:
		if (protein_coordinates.Nterm == off3) {
			protein_coordinates.Cterm = off2;
		} else if (protein_coordinates.Nterm == off2) {
			protein_coordinates.Cterm = off3;
		} else if (protein_coordinates.Nterm == off1) {
			protein_coordinates.Cterm = off1;
		}
		break;
	case 1:
		if (protein_coordinates.Nterm == off3) {
			protein_coordinates.Cterm = off1;
		} else if (protein_coordinates.Nterm == off1) {
			protein_coordinates.Cterm = off3;
		} else if (protein_coordinates.Nterm == off2) {
			protein_coordinates.Cterm = off2;
		}
		break;
	default:
		break;
	}

	// Protein start: a segment whose N-terminal offset is not off3 starts on
	// the previous segment's end position; otherwise it starts one past it,
	// except for the very first entry, which starts at 0.
	if (protein_coordinates.Nterm != off3) {
		protein_coordinates.start = prev_proteint_coordinates.end;
	} else if (prev_proteint_coordinates.end == 0 && coordinates_map.empty()) {
		protein_coordinates.start = 0;
	} else {
		protein_coordinates.start = prev_proteint_coordinates.end + 1;
	}

	// Subtract the N-terminal offset (when it is not off3) before dividing
	// the segment length by three.
	int offsets = 0;
	if (protein_coordinates.Nterm != off3) {
		offsets += protein_coordinates.Nterm;
	}

	if (on_first_strand) {
		length = genCoord.end - genCoord.start + 1 - offsets;
	} else {
		length = genCoord.start - genCoord.end + 1 - offsets;
	}

	const int codons = length / 3;

	// Protein end: one extra position for each terminal offset that is not off3.
	int last_position = protein_coordinates.start + codons - 1;
	if (protein_coordinates.Cterm != off3) {
		last_position += 1;
	}
	if (protein_coordinates.Nterm != off3) {
		last_position += 1;
	}
	protein_coordinates.end = last_position;

	prev_proteint_coordinates = protein_coordinates;

	coordinates_map.insert(CoordinateMapType::value_type(protein_coordinates, genCoord));
}

assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrapper, MappedPeptides& mapping) {
if (!open(file)) {
throw GTFParser__file_not_found_exception();
Expand All @@ -61,10 +184,10 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
Coordinates prev_proteint_coordinates = Coordinates();
assembly assem = none;
std::vector<std::string> tokens;
std::list<GenomeCoordinates> CDS_coords;
while (std::getline(m_ifstream, m_line)) {
if ((m_line[0] != '#')) {
tokenize(m_line, tokens, "\t");

if (is_next_gene(tokens)) {
assembly assemtemp = mapping.add_gene_from_gtf(m_line);
if (assem == none) {
Expand All @@ -74,6 +197,10 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
}
}
if (is_next_transcript(tokens)) {
if (p_protein_entry != nullptr && !CDS_coords.empty()) {
protein_exons_combine(coordinates_map, CDS_coords);
}
CDS_coords.clear();
mapping.add_transcript_id_to_gene(m_line);
if (p_protein_entry != nullptr) {
p_protein_entry->set_coordinate_map(coordinates_map);
Expand All @@ -83,12 +210,6 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
std::cout << "ERROR: No entry for with transcript ID: " << GeneEntry::extract_transcript_id(m_line, GENOME_MAPPER_GLOBALS::ID::ID_VERSION_INCLUDE) << "\n";
continue;
}
protein_coordinates = Coordinates();
prev_proteint_coordinates = Coordinates();
prev_proteint_coordinates.Cterm = off3;
prev_proteint_coordinates.Nterm = off3;
prev_proteint_coordinates.start = 0;
prev_proteint_coordinates.end = 0;
coordinates_map = CoordinateMapType();
}
else if (is_exon(tokens)) {
Expand All @@ -102,92 +223,15 @@ assembly GTFParser::read(const std::string& file, CoordinateWrapper& coordwrappe
tmp_exonId = exonId;
}
genCoord.exonid = tmp_exonId;
protein_coordinates = Coordinates();
// get nterm from prev exon
if (genCoord.frame != unknown) {
protein_coordinates.Nterm = Offset(int(genCoord.frame));
} else {
if (prev_proteint_coordinates.Cterm != off3) {
protein_coordinates.Nterm = Offset(3 - prev_proteint_coordinates.Cterm);
} else {
protein_coordinates.Nterm = off3;
}
}

int length = 0;

if (is_first_strand(tokens)) {
length = genCoord.end - genCoord.start + 1;
} else if (!is_first_strand(tokens)) {
length = genCoord.start - genCoord.end + 1;
}

// calc cterm
if (length % 3 == 0) {
if (protein_coordinates.Nterm != off3) {
protein_coordinates.Cterm = Offset(3 - protein_coordinates.Nterm);
} else {
protein_coordinates.Cterm = off3;
}
} else if (length % 3 == 2) {
if (protein_coordinates.Nterm == off3) {
protein_coordinates.Cterm = off2;
} else if (protein_coordinates.Nterm == off2) {
protein_coordinates.Cterm = off3;
} else if (protein_coordinates.Nterm == off1) {
protein_coordinates.Cterm = off1;
}
} else if (length % 3 == 1) {
if (protein_coordinates.Nterm == off3) {
protein_coordinates.Cterm = off1;
} else if (protein_coordinates.Nterm == off1) {
protein_coordinates.Cterm = off3;
} else if (protein_coordinates.Nterm == off2) {
protein_coordinates.Cterm = off2;
}
}

// calc protein coordinates
if (protein_coordinates.Nterm != off3) {
protein_coordinates.start = prev_proteint_coordinates.end;
} else {
if (prev_proteint_coordinates.end == 0 && coordinates_map.empty()) {
protein_coordinates.start = 0;
} else {
protein_coordinates.start = prev_proteint_coordinates.end + 1;
}
}

int offsets = 0;
if (protein_coordinates.Nterm != off3) {
offsets = offsets + protein_coordinates.Nterm;
}

if (is_first_strand(tokens)) {
length = genCoord.end - genCoord.start + 1 - offsets;
} else if (!is_first_strand(tokens)) {
length = genCoord.start - genCoord.end + 1 - offsets;
}

int peplength = length / 3;

int pepend = protein_coordinates.start + peplength - 1;
if (protein_coordinates.Cterm != off3) {
pepend = pepend + 1;
}
if (protein_coordinates.Nterm != off3) {
pepend = pepend + 1;
}

protein_coordinates.end = pepend;

prev_proteint_coordinates = protein_coordinates;

coordinates_map.insert(CoordinateMapType::value_type(protein_coordinates, genCoord));
CDS_coords.push_back(genCoord);
}
}
tokens.clear();
}
if (p_protein_entry != nullptr && !CDS_coords.empty()) {
protein_exons_combine(coordinates_map, CDS_coords);
}
CDS_coords.clear();
if (p_protein_entry != nullptr) {
p_protein_entry->set_coordinate_map(coordinates_map);
}
Expand Down
7 changes: 7 additions & 0 deletions PoGo/src/GTFParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,15 @@ class GTFParser {
//closes the filestream
void close();

//computes the protein coordinates of one CDS segment (genCoord) from its frame and the previous segment's coordinates,
//inserts the resulting (protein, genome) pair into coordinates_map and stores the result in prev_proteint_coordinates
void protein_exons_combine(Coordinates & protein_coordinates, Coordinates & prev_proteint_coordinates, GenomeCoordinates & genCoord, CoordinateMapType & coordinates_map);
//sorts CDS_coords (in place) and passes each segment in turn to the overload above, filling coordinates_map
void protein_exons_combine(CoordinateMapType & coordinates_map, std::list<GenomeCoordinates> & CDS_coords);

//returns true if in the GTF at position 6 there is a + (plus strand)
bool static is_first_strand(std::vector<std::string> const& tokens);
//returns true if the token string is a + (plus strand)
bool static is_first_strand(std::string const& token);
//returns true if strand is fwd or 1
bool static is_first_strand(Strand const& token);
//returns true if position 2 in the GTF says "CDS"
bool static is_cds(std::vector<std::string> const& tokens);
//returns true if position 2 in the GTF says "exon"
Expand Down
33 changes: 21 additions & 12 deletions PoGo/src/GeneEntry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,36 +59,45 @@ bool GeneEntry::is_patchhaploscaff() {
}

std::string GeneEntry::extract_gene_id(std::string gtfGeneLine, bool versionincl) {
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID) + GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID.length();
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID);
std::string value = "";
if (index != std::string::npos) {
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
value = value + gtfGeneLine[index];
index += 1;
index = index + GENOME_MAPPER_GLOBALS::ID::GTF_GENE_ID.length();
if (index != std::string::npos) {
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
value = value + gtfGeneLine[index];
index += 1;
}
}
}
return value;
}

std::string GeneEntry::extract_transcript_id(std::string gtfGeneLine, bool versionincl) {
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID) + GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID.length();
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID);
std::string value = "";
if (index != std::string::npos) {
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
value = value + gtfGeneLine[index];
index += 1;
index = index + GENOME_MAPPER_GLOBALS::ID::GTF_TRANSCRIPT_ID.length();
if (index != std::string::npos) {
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
value = value + gtfGeneLine[index];
index += 1;
}
}
}
return value;
}

std::string GeneEntry::extract_exon_id(std::string gtfGeneLine, bool versionincl) {
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID) + GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID.length();
std::size_t index = gtfGeneLine.find(GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID);
std::string value = "";
if (index != std::string::npos) {
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
value = value + gtfGeneLine[index];
index += 1;
index = index + GENOME_MAPPER_GLOBALS::ID::GTF_EXON_ID.length();
if (index != std::string::npos) {
while (gtfGeneLine[index] != '\"' && (versionincl || gtfGeneLine[index] != '.')) {
value = value + gtfGeneLine[index];
index += 1;
}
}
}
return value;
Expand Down
Loading

0 comments on commit fd39823

Please sign in to comment.