Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Biopython update #157

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
fail-fast: false
matrix:
os: [macOS-latest, ubuntu-latest]
python-version: [3.7, 3.8]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
env:
CI_OS: ${{ matrix.os }}
PYVER: ${{ matrix.python-version }}
Expand Down
2 changes: 1 addition & 1 deletion devtools/conda-envs/test_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies:
- tqdm
# Chimera reimplementation
- biotite
- biopython<=1.77
- biopython
# mmligner
- mmligner
# theseus
Expand Down
2 changes: 1 addition & 1 deletion devtools/conda-envs/user_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies:
- tqdm
# Chimera reimplementation
- biotite
- biopython<=1.77
- biopython
# mmligner
- mmligner
# theseus
Expand Down
14 changes: 10 additions & 4 deletions opencadd/databases/klifs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,19 @@ def _standardize_column_values(dataframe):

# TODO Use None instead of "-"; but may affect downstream pipelines that use "-" already
if "structure.alternate_model" in dataframe.columns:
dataframe["structure.alternate_model"].replace("", "-", inplace=True)
dataframe["structure.alternate_model"] = dataframe[
"structure.alternate_model"
].replace("", "-")
if "ligand.expo_id" in dataframe.columns:
dataframe["ligand.expo_id"].replace(0, "-", inplace=True)
dataframe["ligand.expo_id"] = dataframe["ligand.expo_id"].replace(0, "-")
if "ligand_allosteric.expo_id" in dataframe.columns:
dataframe["ligand_allosteric.expo_id"].replace(0, "-", inplace=True)
dataframe["ligand_allosteric.expo_id"] = dataframe[
"ligand_allosteric.expo_id"
].replace(0, "-")
if "structure.resolution" in dataframe.columns:
dataframe["structure.resolution"].replace(0, np.nan, inplace=True)
dataframe["structure.resolution"] = dataframe["structure.resolution"].replace(
0, np.nan
)

# In case of drugs
if "drug.brand_name" in dataframe.columns:
Expand Down
6 changes: 3 additions & 3 deletions opencadd/databases/klifs/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,9 @@ def _from_klifs_overview_file(klifs_overview_path):
)

# Unify column 'alternate model' with corresponding column in KLIFS_export.csv
klifs_overview["structure.alternate_model"].replace( # pylint: disable=E1136
" ", "-", inplace=True
)
klifs_overview["structure.alternate_model"] = klifs_overview[
"structure.alternate_model"
].replace(" ", "-")
# Drop column for kinase name; will be taken from KLIFS_export.csv file upon table merge
klifs_overview.drop(columns=["kinase.klifs_name"], inplace=True)

Expand Down
74 changes: 46 additions & 28 deletions opencadd/databases/klifs/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def __getstate__(self, *args):
KLIFS_CLIENT = SerializableSwaggerClient.from_url(
KLIFS_API_DEFINITIONS, config={"validate_responses": False}
)
# Maximum chunk size for input values for remote client endpoints
N_CHUNK = 10_000


class RemoteInitializer:
Expand Down Expand Up @@ -564,20 +566,28 @@ def all_interactions(self):

def by_structure_klifs_id(self, structure_klifs_ids):
structure_klifs_ids = self._ensure_list(structure_klifs_ids)
# Use KLIFS API
result = (
self._client.Interactions.get_interactions_get_IFP(structure_ID=structure_klifs_ids)
.response()
.result
)
# Convert list of ABC objects to DataFrame
interactions = self._abc_to_dataframe(result)
# Standardize DataFrame
interactions = self._standardize_dataframe(
interactions,
FIELDS.oc_name_to_type("interactions"),
FIELDS.remote_to_oc_names("interactions"),
)
interactions = []
# NOTE: Chunk IDs to avoid HTTPURITooLong
for i in range(0, len(structure_klifs_ids), N_CHUNK):
_structure_klifs_ids = structure_klifs_ids[i : i + N_CHUNK]
# Use KLIFS API
result = (
self._client.Interactions.get_interactions_get_IFP(
structure_ID=_structure_klifs_ids
)
.response()
.result
)
# Convert list of ABC objects to DataFrame
_interactions = self._abc_to_dataframe(result)
# Standardize DataFrame
_interactions = self._standardize_dataframe(
_interactions,
FIELDS.oc_name_to_type("interactions"),
FIELDS.remote_to_oc_names("interactions"),
)
interactions.append(_interactions)
interactions = pd.concat(interactions)
return interactions

def by_ligand_klifs_id(self, ligand_klifs_ids):
Expand Down Expand Up @@ -886,20 +896,28 @@ def all_conformations(self):

def by_structure_klifs_id(self, structure_klifs_ids):
structure_klifs_ids = self._ensure_list(structure_klifs_ids)
# Use KLIFS API
result = (
self._client.Structures.get_structure_conformation(structure_ID=structure_klifs_ids)
.response()
.result
)
# Convert list of ABC objects to DataFrame
conformations = self._abc_to_dataframe(result)
# Standardize DataFrame
conformations = self._standardize_dataframe(
conformations,
FIELDS.oc_name_to_type("structure_conformations"),
FIELDS.remote_to_oc_names("structure_conformations"),
)
conformations = []
# NOTE: Chunk IDs to avoid HTTPURITooLong
for i in range(0, len(structure_klifs_ids), N_CHUNK):
_structure_klifs_ids = structure_klifs_ids[i : i + N_CHUNK]
# Use KLIFS API
result = (
self._client.Structures.get_structure_conformation(
structure_ID=_structure_klifs_ids
)
.response()
.result
)
# Convert list of ABC objects to DataFrame
_conformations = self._abc_to_dataframe(result)
# Standardize DataFrame
_conformations = self._standardize_dataframe(
_conformations,
FIELDS.oc_name_to_type("structure_conformations"),
FIELDS.remote_to_oc_names("structure_conformations"),
)
conformations.append(_conformations)
conformations = pd.concat(conformations)
return conformations


Expand Down
1 change: 1 addition & 0 deletions opencadd/structure/superposition/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Command Line Interface for opencadd
"""

import argparse
import logging

Expand Down
1 change: 1 addition & 0 deletions opencadd/structure/superposition/engines/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Base class for all the superposition engines.
"""

import logging

_logger = logging.getLogger(__name__)
Expand Down
1 change: 0 additions & 1 deletion opencadd/structure/superposition/engines/mda.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@


class MDAnalysisAligner(BaseAligner):

"""
Factory to configure an aligner based on
MDAnalysis' superposition algorithms.
Expand Down
2 changes: 1 addition & 1 deletion opencadd/structure/superposition/engines/theseus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
* Accurate structural correlations from maximum likelihood superpositions.
Theobald, Douglas L. & Wuttke, Deborah S. (2008) PLOS Computational Biology 4(2):e43
"""

import os
import subprocess
import logging
Expand All @@ -33,7 +34,6 @@


class TheseusAligner(BaseAligner):

"""
Superpose structures with different sequences but similar structures

Expand Down
36 changes: 29 additions & 7 deletions opencadd/structure/superposition/sequences.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Utilities for sequence alignment
"""

import logging

import numpy as np
Expand Down Expand Up @@ -77,6 +78,27 @@ def sequence_alignment(seq1: str, seq2: str, matrix: str, gap: int, local: bool
return alignment[0]


def find_gap_character(seq):
"""
Finds the gap character of an alignment sequence

Args:
seq: str
aligned sequence

Returns:
gap character if found: str
None else
"""
if "-" in seq:
return "-"
elif "=" in seq:
return "="
else:
return ""
# raise ValueError("No standard gap character found!")


def fasta2select(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't the gap characters be removed explicitly now that BioPython doesn't recognize them?

From BioPython Documentation:

Another case where the alphabet was used was in declaring the gap character, by default - in the various Biopython sequence and alignment parsers. If you are using a different character, you will need to pass this to the Seq object .replace() method explicitly now:

# Old style
from Bio.Alphabet import generic_dna, Gapped
from Bio.Seq import Seq

my_dna = Seq("ACGT=TT", Gapped(generic_dna, "="))
print(my_dna.ungap())
# New style
from Bio.Seq import Seq

my_dna = Seq("ACGT=TT")
print(my_dna.replace("=", ""))

fastafilename,
ref_resids=None,
Expand Down Expand Up @@ -112,9 +134,8 @@ def fasta2select(
dictionary with 'reference' and 'mobile' selection string

"""
protein_gapped = Bio.Alphabet.Gapped(Bio.Alphabet.IUPAC.protein)
with open(fastafilename) as fasta:
alignment = Bio.AlignIO.read(fasta, "fasta", alphabet=protein_gapped)
alignment = Bio.AlignIO.read(fasta, "fasta")

nseq = len(alignment)
if nseq != 2:
Expand All @@ -127,7 +148,7 @@ def fasta2select(
if orig_resids[iseq] is None:
# build default: assume consecutive numbering of all
# residues in the alignment
GAP = a.seq.alphabet.gap_char
GAP = find_gap_character(a.seq)
length = len(a.seq) - a.seq.count(GAP)
orig_resids[iseq] = np.arange(1, length + 1)
else:
Expand All @@ -136,7 +157,7 @@ def fasta2select(
if orig_segids[iseq] is None:
# build default: assume consecutive numbering of all
# residues in the alignment
GAP = a.seq.alphabet.gap_char
GAP = find_gap_character(a.seq)
length = len(a.seq) - a.seq.count(GAP)
orig_segids[iseq] = np.full(length, "")
else:
Expand Down Expand Up @@ -175,7 +196,8 @@ def resid_factory(alignment, seq2resids):
t = np.zeros((nseq, alignment.get_alignment_length()), dtype=int)
s = np.zeros((nseq, alignment.get_alignment_length()), dtype=object)
for iseq, a in enumerate(alignment):
GAP = a.seq.alphabet.gap_char
print(a.seq)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leftover from debugging?

GAP = find_gap_character(a.seq)
indices = np.cumsum(np.where(np.array(list(a.seq)) == GAP, 0, 1)) - 1
t[iseq, :] = seq2resids[iseq][0][indices]
s[iseq, :] = seq2resids[iseq][1][indices]
Expand All @@ -190,8 +212,8 @@ def resid(nseq, ipos, t=t, s=s):
res_list = [] # collect individual selection string

# should be the same for both seqs
GAP = alignment[0].seq.alphabet.gap_char
if GAP != alignment[1].seq.alphabet.gap_char:
GAP = find_gap_character(a.seq)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this (Line 214) be GAP = find_gap_character(alignment[0].seq) instead?
a is a loop variable from above, and is simply equal to the last value of the loop iteration, which seems quite random. Also, the original lines were:

GAP = alignment[0].seq.alphabet.gap_char
if GAP != alignment[1].seq.alphabet.gap_char:

and the second line has changed to

if GAP != find_gap_character(alignment[1].seq):

so it looks like the first line must also be:

GAP = find_gap_character(alignment[0].seq)

instead of:

GAP = find_gap_character(a.seq)

if GAP != find_gap_character(alignment[1].seq):
raise ValueError("Different gap characters in sequence 'target' and 'mobile'.")
for ipos in range(alignment.get_alignment_length()):
aligned = list(alignment[:, ipos])
Expand Down
Loading