Merge pull request #47 from Habacef/main

Allow user to select & configure stable water-analysis during setup
wolberlab · Dec 15, 2023 · 3416a17 · 3416a17
2 parents 2242d69 + 19c2912
commit 3416a17
Show file tree

Hide file tree

Showing 4 changed files with 98 additions and 51 deletions.
diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py
@@ -9,14 +9,17 @@
 
 # the following function is called by the last function in this script. It is only to be run if the water was already calculated.
 # finding the water molecules is the campute intensive task, thus i splitted it in the two subtasks.
-def perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory):
-    def write_pdb_clusters_and_representatives(clustered_waters):
+def perform_clustering_and_writing(stable_waters, cluster_eps, total_frames, output_directory):
+    def write_pdb_clusters_and_representatives(clustered_waters, min_samples, output_directory):
         atom_counter = 1
         pdb_file_counter = 1
         print("cluster_eps:")
         print(cluster_eps)
         print("minsamples:")
         print(min_samples)
+        sub_output_directory = output_directory + "/clusterSize"
+        sub_output_directory += str(min_samples)
+        os.makedirs(sub_output_directory, exist_ok=True)
         with pd.option_context('display.max_rows', None):  # Temporarily set display options
             for label, cluster in clustered_waters.groupby("Cluster_Label"):
                 pdb_lines = []
@@ -29,7 +32,7 @@ def write_pdb_clusters_and_representatives(clustered_waters):
                     atom_counter += 1
 
                 # Write the current cluster to a new PDB file
-                output_filename = os.path.join(output_directory, f"cluster_{label}.pdb")
+                output_filename = os.path.join(sub_output_directory, f"cluster_{label}.pdb")
                 with open(output_filename, "w") as pdb_file:
                     pdb_file.write("".join(pdb_lines))
                     print(f"Cluster {label} written")
@@ -39,7 +42,7 @@ def write_pdb_clusters_and_representatives(clustered_waters):
         # Write representative water molecules to a PDB file
         representative_waters = clustered_waters.groupby("Cluster_Label").mean()
         representative_waters.reset_index(inplace=True)
-        representative_filename = "representative_waters.pdb"
+        representative_filename = os.path.join(sub_output_directory, "representative_waters.pdb")
         with open(representative_filename, "w") as pdb_file:
             for index, row in representative_waters.iterrows():
                 x, y, z = row["Oxygen_X"], row["Oxygen_Y"], row["Oxygen_Z"]
@@ -48,27 +51,37 @@ def write_pdb_clusters_and_representatives(clustered_waters):
 
     # Feature extraction: XYZ coordinates
     X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]]
+
+    # List of percentages to iterate over
+    percentage_values = [25, 50, 75, 90, 99]
 
-    # Apply DBSCAN clustering
-    dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples)
-    labels = dbscan.fit_predict(X)
+    # Assuming total_frames and X are defined outside of this snippet
+    for percent in percentage_values:
+       # Adjust min_percent based on the current iteration
+        min_percent = percent / 100
 
-    # Filter out noise (clusters with fewer than a threshold number of points)
-    clustered_waters = stable_waters.copy()
-    clustered_waters["Cluster_Label"] = labels
-    clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1]  # Remove noise
+        # Rest of your code remains the same
+        min_samples = int(min_percent * total_frames)
+        dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples)
+        labels = dbscan.fit_predict(X)
 
-    # Call the writing function
-    write_pdb_clusters_and_representatives(clustered_waters)
+        # Filter out noise and call the writing function
+        clustered_waters = stable_waters.copy()
+        clustered_waters["Cluster_Label"] = labels
+        clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1]  # Remove noise
 
-def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_percent=0.45, output_directory="./stableWaters"):
+        # Call the writing function
+        write_pdb_clusters_and_representatives(clustered_waters, min_samples, output_directory)
+
+def process_trajectory_and_cluster(topology, trajectory, water_eps, output_directory="./stableWaters"):
     # Load the PDB and DCD files
     u = mda.Universe(topology, trajectory)
+    output_directory += "_clusterEps_"
+    strEps = str(water_eps).replace(".", "")
+    output_directory += strEps
     os.makedirs(output_directory, exist_ok=True)
     # Get the total number of frames for the progress bar
     total_frames = len(u.trajectory)
-    min_samples = min_percent*total_frames
-    min_samples = int(min_samples)
     # Create an empty DataFrame to store stable water coordinates
     stable_waters = pd.DataFrame(columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"])
 
@@ -110,10 +123,10 @@ def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_pe
     stable_waters.to_csv(os.path.join(output_directory, "stable_waters.csv"), index=False)
 
     # Call the clustering and writing function with the stable_waters DataFrame and output directory
-    perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory)
+    perform_clustering_and_writing(stable_waters, water_eps, total_frames, output_directory)
 
 # Call the function with the desired water type and specify the output directory
-# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", cluster_eps=1.0, min_samples=1500, output_directory=".")
+# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", water_eps=1.0, min_samples=1500, output_directory=".")
 
 
 
@@ -179,13 +192,21 @@ def read_pdb_as_dataframe(pdb_file):
     return representative_waters
 
 # Encapsulate the code in a function
-def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, distance_threshold=5.0):
-    representative_waters = read_pdb_as_dataframe(representative_waters_file)
-    filtered_structure = filter_and_parse_pdb(protein_pdb_file)
-    interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold)
-
-    result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues'])
+def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, cluster_eps, output_directory="./stableWaters", distance_threshold=5.0):
+    output_directory += "_clusterEps_"
+    strEps = str(cluster_eps).replace(".", "")
+    output_directory += strEps
 
-    # Export to CSV
-    result_df.to_csv("interacting_residues.csv", index=False)
-    print("Exported interacting_residues.csv")
+    # Iterate over subdirectories
+    for subdirectory in os.listdir(output_directory):
+        subdirectory_path = os.path.join(output_directory, subdirectory)
+        if os.path.isdir(subdirectory_path):
+            # Perform operations within each subdirectory
+            representative_waters = read_pdb_as_dataframe(os.path.join(subdirectory_path, representative_waters_file))
+            filtered_structure = filter_and_parse_pdb(protein_pdb_file)
+            interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold)
+            result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues'])
+            # Save result to each subdirectory
+            result_df.to_csv(os.path.join(subdirectory_path, "interacting_residues.csv"), index=False)
+            print(f"Exported interacting_residues.csv in {subdirectory_path}")
+
diff --git a/openmmdl/openmmdl_analysis/openmmdlanalysis.py b/openmmdl/openmmdl_analysis/openmmdlanalysis.py
@@ -66,6 +66,8 @@ def main():
     parser.add_argument('-nuc', dest='receptor_nucleic', help='Treat nucleic acids as receptor', default=False)
     parser.add_argument('-s', dest='special_ligand', help='Calculate interactions with special ligands', default=None)
     parser.add_argument('-pep', dest='peptide', help='Calculate interactions with peptides. Give the peptides chain name as input. Defaults to None', default=None)
+    parser.add_argument('-w', dest='stable_water_analysis', help='Should stable water analysis be performed? True or False', default=False)
+    parser.add_argument('--watereps', dest='water_eps', help='Set the Eps for clustering, this defines how big clusters can be spatially in Angstrom', default=1.0)
 
     input_formats = ['.pdb', '.dcd', '.sdf', '.csv'] 
     args = parser.parse_args()
@@ -78,11 +80,14 @@ def main():
     topology = args.topology
     trajectory = args.trajectory
 
-    if not args.ligand_sdf and  args.peptide == None:
+    water_eps = float(args.water_eps)
+    stable_water_analysis = bool(args.stable_water_analysis)
+    # The following is the current water analysis if no ligand is present. 
+    if not args.ligand_sdf and args.peptide == None and stable_water_analysis:
         print("All analyses will be run which can be done without a ligand present")
         #...
-        process_trajectory_and_cluster(topology, trajectory)
-        analyze_protein_and_water_interaction(topology,"representative_waters.pdb")
+        process_trajectory_and_cluster(topology, trajectory, water_eps)
+        analyze_protein_and_water_interaction(topology,"representative_waters.pdb", water_eps)
         sys.exit()   
 
 
@@ -508,6 +513,11 @@ def main():
         print("\033[1mPharmacophores generated\033[0m")
 
     print("\033[1mAnalysis is Finished.\033[0m")
+
+    if stable_water_analysis:
+        process_trajectory_and_cluster(topology, trajectory, water_eps)
+        analyze_protein_and_water_interaction(topology,"representative_waters.pdb")
+
 
 
 if __name__ == "__main__":