volkamerlab · pipaj97 · Jun 25, 2023 · Jun 25, 2023 · Jun 25, 2023 · Jan 30, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -20,7 +20,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [macOS-latest, ubuntu-latest]
-        python-version: [3.7, 3.8]
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
     env:
       CI_OS: ${{ matrix.os }}
       PYVER: ${{ matrix.python-version }}

diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
@@ -18,7 +18,7 @@ dependencies:
   - tqdm
   # Chimera reimplementation
   - biotite
-  - biopython<=1.77
+  - biopython
   # mmligner
   - mmligner
   # theseus

diff --git a/devtools/conda-envs/user_env.yaml b/devtools/conda-envs/user_env.yaml
@@ -17,7 +17,7 @@ dependencies:
   - tqdm
   # Chimera reimplementation
   - biotite
-  - biopython<=1.77
+  - biopython
   # mmligner
   - mmligner
   # theseus

diff --git a/opencadd/databases/klifs/core.py b/opencadd/databases/klifs/core.py
@@ -140,13 +140,19 @@ def _standardize_column_values(dataframe):
 
         # TODO Use None instead of "-"; but may affect downstream pipelines that use "-" already
         if "structure.alternate_model" in dataframe.columns:
-            dataframe["structure.alternate_model"].replace("", "-", inplace=True)
+            dataframe["structure.alternate_model"] = dataframe[
+                "structure.alternate_model"
+            ].replace("", "-")
         if "ligand.expo_id" in dataframe.columns:
-            dataframe["ligand.expo_id"].replace(0, "-", inplace=True)
+            dataframe["ligand.expo_id"] = dataframe["ligand.expo_id"].replace(0, "-")
         if "ligand_allosteric.expo_id" in dataframe.columns:
-            dataframe["ligand_allosteric.expo_id"].replace(0, "-", inplace=True)
+            dataframe["ligand_allosteric.expo_id"] = dataframe[
+                "ligand_allosteric.expo_id"
+            ].replace(0, "-")
         if "structure.resolution" in dataframe.columns:
-            dataframe["structure.resolution"].replace(0, np.nan, inplace=True)
+            dataframe["structure.resolution"] = dataframe["structure.resolution"].replace(
+                0, np.nan
+            )
 
         # In case of drugs
         if "drug.brand_name" in dataframe.columns:

diff --git a/opencadd/databases/klifs/local.py b/opencadd/databases/klifs/local.py
@@ -184,9 +184,9 @@ def _from_klifs_overview_file(klifs_overview_path):
         )
 
         # Unify column 'alternate model' with corresponding column in KLIFS_export.csv
-        klifs_overview["structure.alternate_model"].replace(  # pylint: disable=E1136
-            " ", "-", inplace=True
-        )
+        klifs_overview["structure.alternate_model"] = klifs_overview[
+            "structure.alternate_model"
+        ].replace(" ", "-")
         # Drop column for kinase name; will be taken from KLIFS_export.csv file upon table merge
         klifs_overview.drop(columns=["kinase.klifs_name"], inplace=True)
 

diff --git a/opencadd/databases/klifs/remote.py b/opencadd/databases/klifs/remote.py
@@ -52,6 +52,8 @@ def __getstate__(self, *args):
 KLIFS_CLIENT = SerializableSwaggerClient.from_url(
     KLIFS_API_DEFINITIONS, config={"validate_responses": False}
 )
+# Maximum chunk size for input values for remote client endpoints
+N_CHUNK = 10_000
 
 
 class RemoteInitializer:
@@ -564,20 +566,28 @@ def all_interactions(self):
 
     def by_structure_klifs_id(self, structure_klifs_ids):
         structure_klifs_ids = self._ensure_list(structure_klifs_ids)
-        # Use KLIFS API
-        result = (
-            self._client.Interactions.get_interactions_get_IFP(structure_ID=structure_klifs_ids)
-            .response()
-            .result
-        )
-        # Convert list of ABC objects to DataFrame
-        interactions = self._abc_to_dataframe(result)
-        # Standardize DataFrame
-        interactions = self._standardize_dataframe(
-            interactions,
-            FIELDS.oc_name_to_type("interactions"),
-            FIELDS.remote_to_oc_names("interactions"),
-        )
+        interactions = []
+        # NOTE: Chunk IDs to avoid HTTPURITooLong
+        for i in range(0, len(structure_klifs_ids), N_CHUNK):
+            _structure_klifs_ids = structure_klifs_ids[i : i + N_CHUNK]
+            # Use KLIFS API
+            result = (
+                self._client.Interactions.get_interactions_get_IFP(
+                    structure_ID=_structure_klifs_ids
+                )
+                .response()
+                .result
+            )
+            # Convert list of ABC objects to DataFrame
+            _interactions = self._abc_to_dataframe(result)
+            # Standardize DataFrame
+            _interactions = self._standardize_dataframe(
+                _interactions,
+                FIELDS.oc_name_to_type("interactions"),
+                FIELDS.remote_to_oc_names("interactions"),
+            )
+            interactions.append(_interactions)
+        interactions = pd.concat(interactions)
         return interactions
 
     def by_ligand_klifs_id(self, ligand_klifs_ids):
@@ -886,20 +896,28 @@ def all_conformations(self):
 
     def by_structure_klifs_id(self, structure_klifs_ids):
         structure_klifs_ids = self._ensure_list(structure_klifs_ids)
-        # Use KLIFS API
-        result = (
-            self._client.Structures.get_structure_conformation(structure_ID=structure_klifs_ids)
-            .response()
-            .result
-        )
-        # Convert list of ABC objects to DataFrame
-        conformations = self._abc_to_dataframe(result)
-        # Standardize DataFrame
-        conformations = self._standardize_dataframe(
-            conformations,
-            FIELDS.oc_name_to_type("structure_conformations"),
-            FIELDS.remote_to_oc_names("structure_conformations"),
-        )
+        conformations = []
+        # NOTE: Chunk IDs to avoid HTTPURITooLong
+        for i in range(0, len(structure_klifs_ids), N_CHUNK):
+            _structure_klifs_ids = structure_klifs_ids[i : i + N_CHUNK]
+            # Use KLIFS API
+            result = (
+                self._client.Structures.get_structure_conformation(
+                    structure_ID=_structure_klifs_ids
+                )
+                .response()
+                .result
+            )
+            # Convert list of ABC objects to DataFrame
+            _conformations = self._abc_to_dataframe(result)
+            # Standardize DataFrame
+            _conformations = self._standardize_dataframe(
+                _conformations,
+                FIELDS.oc_name_to_type("structure_conformations"),
+                FIELDS.remote_to_oc_names("structure_conformations"),
+            )
+            conformations.append(_conformations)
+        conformations = pd.concat(conformations)
         return conformations
 
 

diff --git a/opencadd/structure/superposition/cli.py b/opencadd/structure/superposition/cli.py
@@ -1,6 +1,7 @@
 """
 Command Line Interface for opencadd
 """
+
 import argparse
 import logging
 

diff --git a/opencadd/structure/superposition/engines/base.py b/opencadd/structure/superposition/engines/base.py
@@ -1,6 +1,7 @@
 """
 Base class for all the superposition engines.
 """
+
 import logging
 
 _logger = logging.getLogger(__name__)

diff --git a/opencadd/structure/superposition/engines/mda.py b/opencadd/structure/superposition/engines/mda.py
@@ -19,7 +19,6 @@
 
 
 class MDAnalysisAligner(BaseAligner):
-
     """
     Factory to configure an aligner based on
     MDAnalysis' superposition algorithms.

diff --git a/opencadd/structure/superposition/engines/theseus.py b/opencadd/structure/superposition/engines/theseus.py
@@ -18,6 +18,7 @@
 * Accurate structural correlations from maximum likelihood superpositions.
   Theobald, Douglas L. & Wuttke, Deborah S. (2008) PLOS Computational Biology 4(2):e43
 """
+
 import os
 import subprocess
 import logging
@@ -33,7 +34,6 @@
 
 
 class TheseusAligner(BaseAligner):
-
     """
     Superpose structures with different sequences but similar structures
 

diff --git a/opencadd/structure/superposition/sequences.py b/opencadd/structure/superposition/sequences.py
@@ -1,6 +1,7 @@
 """
 Utilities for sequence alignment
 """
+
 import logging
 
 import numpy as np
@@ -77,6 +78,27 @@ def sequence_alignment(seq1: str, seq2: str, matrix: str, gap: int, local: bool
     return alignment[0]
 
 
+def find_gap_character(seq):
+    """
+    Finds the gap character of an alignment sequence
+
+    Args:
+        seq: str
+            aligned sequence
+
+    Returns:
+        gap character if found: str
+        None else
+    """
+    if "-" in seq:
+        return "-"
+    elif "=" in seq:
+        return "="
+    else:
+        return ""
+    # raise ValueError("No standard gap character found!")
+
+
 def fasta2select(
     fastafilename,
     ref_resids=None,
@@ -112,9 +134,8 @@ def fasta2select(
         dictionary with 'reference' and 'mobile' selection string
 
     """
-    protein_gapped = Bio.Alphabet.Gapped(Bio.Alphabet.IUPAC.protein)
     with open(fastafilename) as fasta:
-        alignment = Bio.AlignIO.read(fasta, "fasta", alphabet=protein_gapped)
+        alignment = Bio.AlignIO.read(fasta, "fasta")
 
     nseq = len(alignment)
     if nseq != 2:
@@ -127,7 +148,7 @@ def fasta2select(
         if orig_resids[iseq] is None:
             # build default: assume consecutive numbering of all
             # residues in the alignment
-            GAP = a.seq.alphabet.gap_char
+            GAP = find_gap_character(a.seq)
             length = len(a.seq) - a.seq.count(GAP)
             orig_resids[iseq] = np.arange(1, length + 1)
         else:
@@ -136,7 +157,7 @@ def fasta2select(
         if orig_segids[iseq] is None:
             # build default: assume consecutive numbering of all
             # residues in the alignment
-            GAP = a.seq.alphabet.gap_char
+            GAP = find_gap_character(a.seq)
             length = len(a.seq) - a.seq.count(GAP)
             orig_segids[iseq] = np.full(length, "")
         else:
@@ -175,7 +196,8 @@ def resid_factory(alignment, seq2resids):
         t = np.zeros((nseq, alignment.get_alignment_length()), dtype=int)
         s = np.zeros((nseq, alignment.get_alignment_length()), dtype=object)
         for iseq, a in enumerate(alignment):
-            GAP = a.seq.alphabet.gap_char
+            print(a.seq)
+            GAP = find_gap_character(a.seq)
             indices = np.cumsum(np.where(np.array(list(a.seq)) == GAP, 0, 1)) - 1
             t[iseq, :] = seq2resids[iseq][0][indices]
             s[iseq, :] = seq2resids[iseq][1][indices]
@@ -190,8 +212,8 @@ def resid(nseq, ipos, t=t, s=s):
     res_list = []  # collect individual selection string
 
     # should be the same for both seqs
-    GAP = alignment[0].seq.alphabet.gap_char
-    if GAP != alignment[1].seq.alphabet.gap_char:
+    GAP = find_gap_character(a.seq)
+    if GAP != find_gap_character(alignment[1].seq):
         raise ValueError("Different gap characters in sequence 'target' and 'mobile'.")
     for ipos in range(alignment.get_alignment_length()):
         aligned = list(alignment[:, ipos])
-Original file line number
+Diff line change
@@ Expand Up / @@ -19,7 +19,6 @@ @@
     class MDAnalysisAligner(BaseAligner):
         """
         Factory to configure an aligner based on
         MDAnalysis' superposition algorithms.
@@ Expand Down @@