Merge pull request #165 from pharmai/development

Release 2.4.0
pharmai · Dec 18, 2024 · 172ae02 · 172ae02
2 parents 82c803c + 979e56d
commit 172ae02
Show file tree

Hide file tree

Showing 10 changed files with 221 additions and 123 deletions.
diff --git a/.gitignore b/.gitignore
@@ -48,3 +48,4 @@ target/
 # Other
 *~
 .nfs*
+tests/
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,5 +1,10 @@
 Changelog
 ---------
+# 2.4.0
+* new "--chains" flag to enable detection of interactions between protein chains by @PhiCMS and @snbolz
+* update setup.py, attempt to fix and install broken python openbabel bindings 3.1.1.1
+* update Dockerfile
+
 # 2.3.1
 * fixes in interaction detection with peptide ligands by @kalinni in #153
 * fix in hydrogen bond acceptor identification by @kalinni in #151

diff --git a/Dockerfile b/Dockerfile
@@ -1,86 +1,23 @@
-FROM ubuntu:18.04 AS builder
-
-ARG OPENBABEL_TAG="openbabel-3-0-0"
-ARG RDKIT_TAG="Release_2019_09_3"
-ARG MMTF_TAG="v1.0.0"
-ARG PYMOL_TAG="v2.3.0"
+FROM debian:stable
 
 LABEL maintainer="PharmAI GmbH <contact@pharm.ai>" \
         org.label-schema.name="PLIP: The Protein-Ligand Interaction Profiler" \
         org.label-schema.description="https://www.doi.org/10.1093/nar/gkv315"
 
+ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y \
-    cmake \
-    git \
-    g++ \
-    imagemagick \
-    libboost-all-dev \
-    libeigen3-dev \
-    libfreetype6-dev \
-    libghc-zlib-dev \
-    libglew-dev \
-    libglm-dev \
-    libmsgpack-dev \
-    libnetcdf-dev \
-    libxml2-dev \
-    libpng-dev \
-    libpython3-all-dev \
-    python3 \
+    pymol \
+    python3-distutils \
     python3-lxml \
-    python3-numpy \
-    python3-pyqt5.qtopengl \
-    swig
-
-# build OpenBabel
-WORKDIR /src
-RUN git clone -b $OPENBABEL_TAG \
-    https://github.com/openbabel/openbabel.git
-WORKDIR /src/openbabel/build
-RUN cmake .. \
-    -DPYTHON_EXECUTABLE=/usr/bin/python3.6 \
-    -DPYTHON_BINDINGS=ON \
-    -DCMAKE_INSTALL_PREFIX=/usr/local \
-    -DRUN_SWIG=ON
-RUN make -j$(nproc --all) install
-
-# build RDkit
-#WORKDIR /src
-#RUN git clone -b $RDKIT_TAG \
-#    https://github.com/rdkit/rdkit.git
-#WORKDIR /src/rdkit/build
-#RUN cmake .. \
-#    -DCMAKE_BUILD_TYPE=Release \
-#    -DRDK_BUILD_INCHI_SUPPORT=ON \
-#    -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.6m.so \
-#    -DPYTHON_INCLUDE_DIR=/usr/include/python3.6 \
-#    -DPYTHON_EXECUTABLE=/usr/bin/python3.6
-#RUN make -j$(nproc --all) install
-#ENV RDBASE /src/rdkit
-#ENV LD_LIBRARY_PATH /usr/local/lib/:$RDBASE/lib
-#ENV PYTHONPATH $PYTHONPATH:$RDBASE
-
-# build mmtf-cpp
-WORKDIR /src
-RUN git clone -b $MMTF_TAG \
-    https://github.com/rcsb/mmtf-cpp.git
-WORKDIR /src/mmtf-cpp/build
-RUN cmake ..
-RUN make -j$(nproc --all) install
-
-# build PyMOL
-WORKDIR /src
-RUN git clone -b $PYMOL_TAG \
-    https://github.com/schrodinger/pymol-open-source
-WORKDIR /src/pymol-open-source
-RUN python3 setup.py build install \
-    --prefix=/usr/local \
-    --jobs=$(nproc --all)
+    python3-openbabel \
+    python3-pymol; \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # copy PLIP source code
 WORKDIR /src
 ADD plip/ plip/
 RUN chmod +x plip/plipcmd.py
-ENV PYTHONPATH $PYTHONPATH:/src
+ENV PYTHONPATH=/src
 
 # execute tests
 WORKDIR /src/plip/test
@@ -89,4 +26,4 @@ RUN ./run_all_tests.sh
 WORKDIR /
 
 # set entry point to plipcmd.py
-ENTRYPOINT  ["python3", "/src/plip/plipcmd.py"]
+ENTRYPOINT  ["python3", "/src/plip/plipcmd.py"]
diff --git a/plip/basic/config.py b/plip/basic/config.py
@@ -1,4 +1,4 @@
-__version__ = '2.3.1'
+__version__ = '2.4.0'
 __maintainer__ = 'PharmAI GmbH (2020-2021) - www.pharm.ai - hello@pharm.ai'
 __citation_information__ = "Adasme,M. et al. PLIP 2021: expanding the scope of the protein-ligand interaction profiler to DNA and RNA. " \
                            "Nucl. Acids Res. (05 May 2021), gkab294. doi: 10.1093/nar/gkab294"
@@ -25,12 +25,15 @@
 NOFIXFILE = False  # Turn off writing to files for fixed PDB structures
 PEPTIDES = []  # Definition which chains should be considered as peptide ligands
 INTRA = None
+RESIDUES = {}
 KEEPMOD = False
 DNARECEPTOR = False
 OUTPUTFILENAME = "report"  # Naming for the TXT and XML report files
 NOPDBCANMAP = False  # Skip calculation of mapping canonical atom order: PDB atom order
 NOHYDRO = False  # Do not add hydrogen bonds (in case already present in the structure)
 MODEL = 1  # The model to be selected for multi-model structures (default = 1).
+CHAINS = None # Define chains for protein-protein interaction detection
+
 
 # Configuration file for Protein-Ligand Interaction Profiler (PLIP)
 # Set thresholds for detection of interactions

diff --git a/plip/basic/supplemental.py b/plip/basic/supplemental.py
@@ -55,6 +55,18 @@ def whichchain(atom):
     return atom.GetResidue().GetChain() if atom.GetResidue() is not None else None
 
 
+def residue_belongs_to_receptor(res, config):
+    """tests whether the residue is defined as receptor and is not part of a peptide or residue ligand."""
+    if config.CHAINS:
+        if config.CHAINS[0] and config.CHAINS[1]:  # if receptor and ligand chains were given
+            return res.GetChain() in config.CHAINS[0] and res.GetChain() not in config.CHAINS[1]
+            # True if residue is part of receptor chains and not of ligand chains
+        if config.CHAINS[1]:  # if only ligand chains were given
+            return res.GetChain() not in config.CHAINS[1]  # True if residue is not part of ligand chains
+        return False  # if only receptor chains were given or both is empty
+    return res.GetChain() not in config.PEPTIDES  # True if residue is not part of peptide ligand.
+
+
 #########################
 # Mathematical operations
 #########################

diff --git a/plip/plipcmd.py b/plip/plipcmd.py
@@ -10,6 +10,7 @@
 import multiprocessing
 import os
 import sys
+import ast
 from argparse import ArgumentParser
 from collections import namedtuple
 
@@ -38,9 +39,19 @@ def threshold_limiter(aparser, arg):
         aparser.error("All thresholds have to be values larger than zero.")
     return arg
 
+def residue_list(input_string):
+    """Parse mix of residue numbers and ranges passed with the --residues flag into one list"""
+    result = []
+    for part in input_string.split(','):
+        if '-' in part:
+            start, end = map(int, part.split('-'))
+            result.extend(range(start, end + 1))
+        else:
+            result.append(int(part))
+    return result
 
 def process_pdb(pdbfile, outpath, as_string=False, outputprefix='report'):
-    """Analysis of a single PDB file. Can generate textual reports XML, PyMOL session files and images as output."""
+    """Analysis of a single PDB file with optional chain filtering."""
     if not as_string:
         pdb_file_name = pdbfile.split('/')[-1]
         startmessage = f'starting analysis of {pdb_file_name}'
@@ -50,7 +61,6 @@ def process_pdb(pdbfile, outpath, as_string=False, outputprefix='report'):
     mol = PDBComplex()
     mol.output_path = outpath
     mol.load_pdb(pdbfile, as_string=as_string)
-    # @todo Offers possibility for filter function from command line (by ligand chain, position, hetid)
     for ligand in mol.ligands:
         mol.characterize_complex(ligand)
 
@@ -116,7 +126,7 @@ def remove_duplicates(slist):
     return unique
 
 
-def run_analysis(inputstructs, inputpdbids):
+def run_analysis(inputstructs, inputpdbids, chains=None):
     """Main function. Calls functions for processing, report generation and visualization."""
     pdbid, pdbpath = None, None
     # @todo For multiprocessing, implement better stacktracing for errors
@@ -127,16 +137,16 @@ def run_analysis(inputstructs, inputpdbids):
     output_prefix = config.OUTPUTFILENAME
 
     if inputstructs is not None:  # Process PDB file(s)
-        num_structures = len(inputstructs)
+        num_structures = len(inputstructs)  # @question: how can it become more than one file? The tilde_expansion function does not consider this case.
         inputstructs = remove_duplicates(inputstructs)
         read_from_stdin = False
         for inputstruct in inputstructs:
-            if inputstruct == '-':
+            if inputstruct == '-':  # @expl: when user gives '-' as input, pdb file is read from stdin
                 inputstruct = sys.stdin.read()
                 read_from_stdin = True
                 if config.RAWSTRING:
                     if sys.version_info < (3,):
-                        inputstruct = bytes(inputstruct).decode('unicode_escape')
+                        inputstruct = bytes(inputstruct).decode('unicode_escape')  # @expl: in Python2, the bytes object is just a string.
                     else:
                         inputstruct = bytes(inputstruct, 'utf8').decode('unicode_escape')
             else:
@@ -218,6 +228,9 @@ def main():
                             help="Allows to define one or multiple chains as peptide ligands or to detect inter-chain contacts",
                             nargs="+")
     ligandtype.add_argument("--intra", dest="intra", help="Allows to define one chain to analyze intra-chain contacts.")
+    parser.add_argument("--residues", dest="residues", default=[], nargs="+",
+                        help="""Allows to specify which residues of the chain(s) should be considered as peptide ligands.
+                        Give single residues (separated with comma) or ranges (with dash) or both, for several chains separate selections with one space""")
     parser.add_argument("--keepmod", dest="keepmod", default=False,
                         help="Keep modified residues as ligands",
                         action="store_true")
@@ -241,7 +254,19 @@ def main():
     for t in thresholds:
         parser.add_argument('--%s' % t.name, dest=t.name, type=lambda val: threshold_limiter(parser, val),
                             help=argparse.SUPPRESS)
+
+    # Add argument to define receptor and ligand chains
+    parser.add_argument("--chains", dest="chains", type=str,
+                        help="Specify chains as receptor/ligand groups, e.g., '[['A'], ['B']]'. "
+                             "Use format [['A'], ['B', 'C']] to define A as receptor, and B, C as ligands.")
+
+
     arguments = parser.parse_args()
+    # make sure, residues is only used together with --inter (could be expanded to --intra in the future)
+    if arguments.residues and not (arguments.peptides or arguments.intra):
+        parser.error("The --residues option requires specification of a chain with --inter or --peptide")
+    if arguments.residues and len(arguments.residues)!=len(arguments.peptides):
+        parser.error("Please provide residue numbers or ranges for each chain specified. Separate selections with a single space.")
     # configure log levels
     config.VERBOSE = True if arguments.verbose else False
     config.QUIET = True if arguments.quiet else False
@@ -268,6 +293,7 @@ def main():
     config.BREAKCOMPOSITE = arguments.breakcomposite
     config.ALTLOC = arguments.altlocation
     config.PEPTIDES = arguments.peptides
+    config.RESIDUES = dict(zip(arguments.peptides, map(residue_list, arguments.residues)))
     config.INTRA = arguments.intra
     config.NOFIX = arguments.nofix
     config.NOFIXFILE = arguments.nofixfile
@@ -277,6 +303,22 @@ def main():
     config.OUTPUTFILENAME = arguments.outputfilename
     config.NOHYDRO = arguments.nohydro
     config.MODEL = arguments.model
+
+    try:
+        # add inner quotes for python backend
+        if not arguments.chains:
+            config.CHAINS = None
+        else:
+            import re
+            quoted_input = re.sub(r'(?<!["\'])\b([a-zA-Z0-9_]+)\b(?!["\'])', r'"\1"', arguments.chains)
+            config.CHAINS = ast.literal_eval(quoted_input)
+        print(config.CHAINS)
+        if config.CHAINS and not all(isinstance(c, list) for c in config.CHAINS):
+            raise ValueError("Chains should be specified as a list of lists, e.g., '[[A], [B, C]]'.")
+    except (ValueError, SyntaxError):
+        parser.error("The --chains option must be in the format '[[A], [B, C]]'.")
+
+
     # Make sure we have pymol with --pics and --pymol
     if config.PICS or config.PYMOL:
         try:

diff --git a/plip/structure/preparation.py b/plip/structure/preparation.py
@@ -14,6 +14,7 @@
 from plip.basic.supplemental import extract_pdbid, read_pdb, create_folder_if_not_exists, canonicalize
 from plip.basic.supplemental import read, nucleotide_linkage, sort_members_by_importance
 from plip.basic.supplemental import whichchain, whichrestype, whichresnumber, euclidean3d, int32_to_negative
+from plip.basic.supplemental import residue_belongs_to_receptor
 from plip.structure.detection import halogen, pication, water_bridges, metal_complexation
 from plip.structure.detection import hydrophobic_interactions, pistacking, hbonds, saltbridge
 
@@ -256,7 +257,7 @@ def getligs(self):
         Returns all non-empty ligands.
         """
 
-        if config.PEPTIDES == [] and config.INTRA is None:
+        if config.PEPTIDES == [] and config.INTRA is None and config.CHAINS is None:
             # Extract small molecule ligands (default)
             ligands = []
 
@@ -284,8 +285,16 @@ def getligs(self):
         else:
             # Extract peptides from given chains
             self.water = [o for o in pybel.ob.OBResidueIter(self.proteincomplex.OBMol) if o.GetResidueProperty(9)]
-            if config.PEPTIDES:
+            if config.PEPTIDES and not config.CHAINS:
                 peptide_ligands = [self.getpeptides(chain) for chain in config.PEPTIDES]
+
+            #Todo: Validate change here... Do we want to combine multiple chains to a single ligand?
+            # if yes can be easily added to the getpeptides function by flatten the resulting list - Philipp
+            elif config.CHAINS:
+                # chains is defined as list of list e.g. [['A'], ['B', 'C']] in which second list contains the
+                # ligand chains and the first one should be the receptor
+                peptide_ligands = [self.getpeptides(chain) for chain in config.CHAINS[1]]
+
             elif config.INTRA is not None:
                 peptide_ligands = [self.getpeptides(config.INTRA), ]
 
@@ -304,7 +313,7 @@ def extract_ligand(self, kmer):
         names = [x[0] for x in members]
         longname = '-'.join([x[0] for x in members])
 
-        if config.PEPTIDES:
+        if config.PEPTIDES or config.CHAINS:
             ligtype = 'PEPTIDE'
         elif config.INTRA is not None:
             ligtype = 'INTRA'
@@ -743,7 +752,7 @@ def refine_hydrophobic(all_h, pistacks):
         hydroph = [h for h in sel2.values()]
         hydroph_final = []
         #  3. If a protein atom interacts with several neighboring ligand atoms, just keep the one with the closest dist
-        if config.PEPTIDES or config.INTRA:
+        if config.PEPTIDES or config.INTRA or config.CHAINS:
             # the ligand also consists of amino acid residues, repeat step 2 just the other way around
             sel3 = {}
             for h in hydroph:
@@ -951,7 +960,7 @@ def find_charged(self, mol):
         data = namedtuple('pcharge', 'atoms atoms_orig_idx type center restype resnr reschain')
         a_set = []
         # Iterate through all residue, exclude those in chains defined as peptides
-        for res in [r for r in pybel.ob.OBResidueIter(mol.OBMol) if not r.GetChain() in config.PEPTIDES]:
+        for res in [r for r in pybel.ob.OBResidueIter(mol.OBMol) if residue_belongs_to_receptor(r, config)]:
             if config.INTRA is not None:
                 if res.GetChain() != config.INTRA:
                     continue
@@ -992,7 +1001,7 @@ def find_charged(self, mol):
                         a_contributing.append(pybel.Atom(a))
                         a_contributing_orig_idx.append(self.Mapper.mapid(a.GetIdx(), mtype='protein'))
                 if not len(a_contributing) == 0:
-                    a_set.append(data(atoms=a_contributing,atoms_orig_idx=a_contributing_orig_idx, type='negative', 
+                    a_set.append(data(atoms=a_contributing,atoms_orig_idx=a_contributing_orig_idx, type='negative',
                                       center=centroid([ac.coords for ac in a_contributing]), restype=res.GetName(),
                                       resnr=res.GetNum(),
                                       reschain=res.GetChain()))
@@ -1051,7 +1060,7 @@ def __init__(self, cclass, ligand):
         self.complex = cclass
         self.molecule = ligand.mol  # Pybel Molecule
         # get canonical SMILES String, but not for peptide ligand (tend to be too long -> openBabel crashes)
-        self.smiles = "" if (config.INTRA or config.PEPTIDES) else self.molecule.write(format='can')
+        self.smiles = "" if (config.INTRA or config.PEPTIDES or config.CHAINS) else self.molecule.write(format='can')
         self.inchikey = self.molecule.write(format='inchikey')
         self.can_to_pdb = ligand.can_to_pdb
         if not len(self.smiles) == 0:
@@ -1192,7 +1201,7 @@ def find_charged(self, all_atoms):
         """
         data = namedtuple('lcharge', 'atoms orig_atoms atoms_orig_idx type center fgroup')
         a_set = []
-        if not (config.INTRA or config.PEPTIDES):
+        if not (config.INTRA or config.PEPTIDES or config.CHAINS):
             for a in all_atoms:
                 a_orig_idx = self.Mapper.mapid(a.idx, mtype=self.mtype, bsid=self.bsid)
                 a_orig = self.Mapper.id_to_atom(a_orig_idx)
@@ -1567,9 +1576,9 @@ def res_belongs_to_bs(res, cutoff, ligcentroid):
         rescentroid = centroid([(atm.x(), atm.y(), atm.z()) for atm in pybel.ob.OBResidueAtomIter(res)])
         # Check geometry
         near_enough = True if euclidean3d(rescentroid, ligcentroid) < cutoff else False
-        # Check chain membership
-        restricted_chain = True if res.GetChain() in config.PEPTIDES else False
-        return (near_enough and not restricted_chain)
+        #Todo: Test if properly working
+        # Add restriction via chains flag
+        return near_enough and residue_belongs_to_receptor(res, config)
 
     def get_atom(self, idx):
         return self.atoms[idx]
@@ -1580,4 +1589,4 @@ def output_path(self):
 
     @output_path.setter
     def output_path(self, path):
-        self._output_path = tilde_expansion(path)
+        self._output_path = tilde_expansion(path)
-Original file line number
+Diff line change
@@ Expand Up / @@ -48,3 +48,4 @@ target/ @@
     # Other
     *~
     .nfs*
+    tests/