From 6109a611c5eb6552fecf0daed5f8bb0b9a76de75 Mon Sep 17 00:00:00 2001 From: Leon Obendorf <44522423+Habacef@users.noreply.github.com> Date: Fri, 15 Dec 2023 11:52:32 +0100 Subject: [PATCH 1/2] Water-analysis control during setup Added the option to control stable-water analysis during setup. Also optimized the output folder structure of the analysis and set to analyze water-clusters on different amount of cluster-sizes (water pesent in 25%, 50%, 75%, 90% and 99% of the MD) --- .../openmmdl_analysis/find_stable_waters.py | 55 ++++++++++++------- .../openmmdl_analysis/openmmdlanalysis.py | 16 +++++- openmmdl/openmmdl_setup/openmmdlsetup.py | 37 +++++++------ .../templates/simulationOptions.html | 21 +++++-- 4 files changed, 86 insertions(+), 43 deletions(-) diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py index ef221a27..846b7ba3 100644 --- a/openmmdl/openmmdl_analysis/find_stable_waters.py +++ b/openmmdl/openmmdl_analysis/find_stable_waters.py @@ -9,8 +9,8 @@ # the following function is called by the last function in this script. It is only to be run if the water was already calculated. # finding the water molecules is the campute intensive task, thus i splitted it in the two subtasks. -def perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory): - def write_pdb_clusters_and_representatives(clustered_waters): +def perform_clustering_and_writing(stable_waters, cluster_eps, output_directory): + def write_pdb_clusters_and_representatives(clustered_waters, min_samples): atom_counter = 1 pdb_file_counter = 1 print("cluster_eps:") @@ -29,6 +29,9 @@ def write_pdb_clusters_and_representatives(clustered_waters): atom_counter += 1 # Write the current cluster to a new PDB file + output_directory += "_" + output_directory += min_samples + os.makedirs(output_directory, exist_ok=True) output_filename = os.path.join(output_directory, f"cluster_{label}.pdb") with open(output_filename, "w") as pdb_file: pdb_file.write("".join(pdb_lines)) @@ -39,7 +42,7 @@ def write_pdb_clusters_and_representatives(clustered_waters): # Write representative water molecules to a PDB file representative_waters = clustered_waters.groupby("Cluster_Label").mean() representative_waters.reset_index(inplace=True) - representative_filename = "representative_waters.pdb" + representative_filename = os.path.join(output_directory, "representative_waters.pdb") with open(representative_filename, "w") as pdb_file: for index, row in representative_waters.iterrows(): x, y, z = row["Oxygen_X"], row["Oxygen_Y"], row["Oxygen_Z"] @@ -48,27 +51,37 @@ def write_pdb_clusters_and_representatives(clustered_waters): # Feature extraction: XYZ coordinates X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]] + + # List of percentages to iterate over + percentage_values = [25, 50, 75, 90, 99] - # Apply DBSCAN clustering - dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples) - labels = dbscan.fit_predict(X) + # Assuming total_frames and X are defined outside of this snippet + for percent in percentage_values: + # Adjust min_percent based on the current iteration + min_percent = percent / 100 - # Filter out noise (clusters with fewer than a threshold number of points) - clustered_waters = stable_waters.copy() - clustered_waters["Cluster_Label"] = labels - clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise + # Rest of your code remains the same + min_samples = int(min_percent * total_frames) + dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples) + labels = dbscan.fit_predict(X) - # Call the writing function - write_pdb_clusters_and_representatives(clustered_waters) + # Filter out noise and call the writing function + clustered_waters = stable_waters.copy() + clustered_waters["Cluster_Label"] = labels + clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise -def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_percent=0.45, output_directory="./stableWaters"): + # Call the writing function + write_pdb_clusters_and_representatives(clustered_waters, min_samples) + +def process_trajectory_and_cluster(topology, trajectory, water_eps, output_directory="./stableWaters"): # Load the PDB and DCD files u = mda.Universe(topology, trajectory) + output_directory += "_clusterEps_" + strEps = str(water_eps).replace(".", "_") + output_directory += strEps os.makedirs(output_directory, exist_ok=True) # Get the total number of frames for the progress bar total_frames = len(u.trajectory) - min_samples = min_percent*total_frames - min_samples = int(min_samples) # Create an empty DataFrame to store stable water coordinates stable_waters = pd.DataFrame(columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"]) @@ -110,10 +123,10 @@ def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_pe stable_waters.to_csv(os.path.join(output_directory, "stable_waters.csv"), index=False) # Call the clustering and writing function with the stable_waters DataFrame and output directory - perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory) + perform_clustering_and_writing(stable_waters, water_eps, total_frames, output_directory) # Call the function with the desired water type and specify the output directory -# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", cluster_eps=1.0, min_samples=1500, output_directory=".") +# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", water_eps=1.0, min_samples=1500, output_directory=".") @@ -179,7 +192,7 @@ def read_pdb_as_dataframe(pdb_file): return representative_waters # Encapsulate the code in a function -def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, distance_threshold=5.0): +def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, cluster_eps, output_directory="./stableWaters", distance_threshold=5.0): representative_waters = read_pdb_as_dataframe(representative_waters_file) filtered_structure = filter_and_parse_pdb(protein_pdb_file) interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold) @@ -187,5 +200,9 @@ def analyze_protein_and_water_interaction(protein_pdb_file, representative_water result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues']) # Export to CSV - result_df.to_csv("interacting_residues.csv", index=False) + output_directory += "_clusterEps_" + strEps = str(cluster_eps).replace(".", "_") + output_directory += strEps + result_df.to_csv(os.path.join(output_directory, "interacting_residues.csv"), index=False) print("Exported interacting_residues.csv") + diff --git a/openmmdl/openmmdl_analysis/openmmdlanalysis.py b/openmmdl/openmmdl_analysis/openmmdlanalysis.py index 30cef20b..ed6ca187 100644 --- a/openmmdl/openmmdl_analysis/openmmdlanalysis.py +++ b/openmmdl/openmmdl_analysis/openmmdlanalysis.py @@ -62,6 +62,8 @@ def main(): parser.add_argument('-nuc', dest='receptor_nucleic', help='Treat nucleic acids as receptor', default=False) parser.add_argument('-s', dest='special_ligand', help='Calculate interactions with special ligands', default=None) parser.add_argument('-pep', dest='peptide', help='Calculate interactions with peptides. Give the peptides chain name as input. Defaults to None', default=None) + parser.add_argument('-w', dest='stable_water_analysis', help='Should stable water analysis be performed? True or False', default=False) + parser.add_argument('--watereps', dest='water_eps', help='Set the Eps for clustering, this defines how big clusters can be spatially in Angstrom', default=1.0) input_formats = ['.pdb', '.dcd', '.sdf', '.csv'] args = parser.parse_args() @@ -74,11 +76,14 @@ def main(): topology = args.topology trajectory = args.trajectory - if not args.ligand_sdf and args.peptide == None: + water_eps = float(args.water_eps) + stable_water_analysis = bool(args.stable_water_analysis) + # The following is the current water analysis if no ligand is present. + if not args.ligand_sdf and args.peptide == None and stable_water_analysis: print("All analyses will be run which can be done without a ligand present") #... - process_trajectory_and_cluster(topology, trajectory) - analyze_protein_and_water_interaction(topology,"representative_waters.pdb") + process_trajectory_and_cluster(topology, trajectory, water_eps) + analyze_protein_and_water_interaction(topology,"representative_waters.pdb", water_eps) sys.exit() @@ -495,6 +500,11 @@ def main(): print("\033[1mPharmacophores generated\033[0m") print("\033[1mAnalysis is Finished.\033[0m") + + if stable_water_analysis: + process_trajectory_and_cluster(topology, trajectory, water_eps) + analyze_protein_and_water_interaction(topology,"representative_waters.pdb") + if __name__ == "__main__": diff --git a/openmmdl/openmmdl_setup/openmmdlsetup.py b/openmmdl/openmmdl_setup/openmmdlsetup.py index 67680125..7eb73e0f 100644 --- a/openmmdl/openmmdl_setup/openmmdlsetup.py +++ b/openmmdl/openmmdl_setup/openmmdlsetup.py @@ -821,7 +821,9 @@ def configureDefaultOptions(): session['binding_mode'] = '40' session['min_transition'] = '1' session['rmsd_diff'] = 'No' - session['pml_generation'] = 'True' + session['pml_generation'] = 'True' + session['stable_water'] = 'Yes' + session['wc_distance'] = '1.0' if session['fileType'] == 'pdb' and session['waterModel'] == 'implicit': implicitWater = True session['ensemble'] = 'nvt' if implicitWater else 'npt' @@ -1283,6 +1285,7 @@ def _xml_script_segment(to_serialize, target_file): lines = [line.format(filename=session['finalStateFilename']) for line in state_script] script.extend(lines) + if session ['md_postprocessing'] == 'True': if fileType == "pdb": script.append("mdtraj_conversion(f'Equilibration_{protein}', '%s')" % session['mdtraj_output']) @@ -1336,47 +1339,47 @@ def _xml_script_segment(to_serialize, target_file): script.append("os.chdir('Final_Output/All_Atoms')") if fileType == "pdb": if session['sdfFile']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['sdfFile'] == '': - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif fileType == "amber": if session['nmLig'] or session['spLig']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) else: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['analysis_selection'] == 'analysis_prot_lig': script.append("os.chdir('Final_Output/Prot_Lig')") if fileType == "pdb": if session['sdfFile']: - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['sdfFile'] == '': - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif fileType == 'amber': if session['nmLig'] or session['spLig']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) else: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['analysis_selection'] == 'analysis_all_prot_lig': if fileType == "pdb": script.append("os.chdir('Final_Output/All_Atoms')") if session['sdfFile']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['sdfFile'] == '': - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif fileType == 'amber': script.append("os.chdir('Final_Output/All_Atoms')") if session['nmLig'] or session['spLig']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) else: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("subprocess.run(analysis_run_command, shell=True, check=True)") return "\n".join(script) diff --git a/openmmdl/openmmdl_setup/templates/simulationOptions.html b/openmmdl/openmmdl_setup/templates/simulationOptions.html index df7de897..32a93ac3 100644 --- a/openmmdl/openmmdl_setup/templates/simulationOptions.html +++ b/openmmdl/openmmdl_setup/templates/simulationOptions.html @@ -223,10 +223,20 @@ {{ textfield('min_transition', 'Enter the minimal markov state transition for the figure generation without %.') }}
- - {{ choice('rmsd_diff', 'RMSD Calculation', [ - ('Yes', 'Yes', 'The RMSD will be calculated'), - ('No', 'No', 'The RMSD will not be calculated')]) }} + + {{ choice('rmsd_diff', 'RMSD Calculation', [ + ('Yes', 'Yes', 'The RMSD will be calculated'), + ('No', 'No', 'The RMSD will not be calculated')]) }} +
+
+ + {{ choice('stable_water', 'Perform stable water clustering', [ + ('True', 'Yes', 'Stable waters will be searched within the MD'), + ('False', 'No', 'No stable water analysis will be performed')]) }} +
+
+ + {{ textfield('wc_distance', 'Enter the max distance of waters that shall still be one cluster.') }}
@@ -377,9 +387,12 @@ function optionChanged() { // Update UI elements. openmmdl_analysis = document.getElementById("openmmdl_analysis").value; + stable_water = document.getElementById("stable_water").value; document.getElementById("analysis_selection").hidden = (openmmdl_analysis != 'Yes'); document.getElementById("rmsd_diff").hidden = (openmmdl_analysis != 'Yes'); document.getElementById("pml_generation").hidden = (openmmdl_analysis != 'Yes'); + document.getElementById("stable_water_container").hidden = (openmmdl_analysis != 'Yes'); + document.getElementById("wc_distance").hidden = (openmmdl_analysis != 'Yes' || stable_water != 'True'); document.getElementById("binding_mode").hidden = (openmmdl_analysis != 'Yes'); document.getElementById("min_transition").hidden = (openmmdl_analysis != 'Yes'); restart_checkpoint = document.getElementById("restart_checkpoint").value; From 19c2912b4503896ece7543ab45c30b5a9e2845fe Mon Sep 17 00:00:00 2001 From: Leon Obendorf <44522423+Habacef@users.noreply.github.com> Date: Fri, 15 Dec 2023 13:56:37 +0100 Subject: [PATCH 2/2] fix --- .../openmmdl_analysis/find_stable_waters.py | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py index 846b7ba3..badf310d 100644 --- a/openmmdl/openmmdl_analysis/find_stable_waters.py +++ b/openmmdl/openmmdl_analysis/find_stable_waters.py @@ -9,14 +9,17 @@ # the following function is called by the last function in this script. It is only to be run if the water was already calculated. # finding the water molecules is the campute intensive task, thus i splitted it in the two subtasks. -def perform_clustering_and_writing(stable_waters, cluster_eps, output_directory): - def write_pdb_clusters_and_representatives(clustered_waters, min_samples): +def perform_clustering_and_writing(stable_waters, cluster_eps, total_frames, output_directory): + def write_pdb_clusters_and_representatives(clustered_waters, min_samples, output_directory): atom_counter = 1 pdb_file_counter = 1 print("cluster_eps:") print(cluster_eps) print("minsamples:") print(min_samples) + sub_output_directory = output_directory + "/clusterSize" + sub_output_directory += str(min_samples) + os.makedirs(sub_output_directory, exist_ok=True) with pd.option_context('display.max_rows', None): # Temporarily set display options for label, cluster in clustered_waters.groupby("Cluster_Label"): pdb_lines = [] @@ -29,10 +32,7 @@ def write_pdb_clusters_and_representatives(clustered_waters, min_samples): atom_counter += 1 # Write the current cluster to a new PDB file - output_directory += "_" - output_directory += min_samples - os.makedirs(output_directory, exist_ok=True) - output_filename = os.path.join(output_directory, f"cluster_{label}.pdb") + output_filename = os.path.join(sub_output_directory, f"cluster_{label}.pdb") with open(output_filename, "w") as pdb_file: pdb_file.write("".join(pdb_lines)) print(f"Cluster {label} written") @@ -42,7 +42,7 @@ def write_pdb_clusters_and_representatives(clustered_waters, min_samples): # Write representative water molecules to a PDB file representative_waters = clustered_waters.groupby("Cluster_Label").mean() representative_waters.reset_index(inplace=True) - representative_filename = os.path.join(output_directory, "representative_waters.pdb") + representative_filename = os.path.join(sub_output_directory, "representative_waters.pdb") with open(representative_filename, "w") as pdb_file: for index, row in representative_waters.iterrows(): x, y, z = row["Oxygen_X"], row["Oxygen_Y"], row["Oxygen_Z"] @@ -62,22 +62,22 @@ def write_pdb_clusters_and_representatives(clustered_waters, min_samples): # Rest of your code remains the same min_samples = int(min_percent * total_frames) - dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples) + dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples) labels = dbscan.fit_predict(X) - # Filter out noise and call the writing function - clustered_waters = stable_waters.copy() - clustered_waters["Cluster_Label"] = labels - clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise + # Filter out noise and call the writing function + clustered_waters = stable_waters.copy() + clustered_waters["Cluster_Label"] = labels + clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise - # Call the writing function - write_pdb_clusters_and_representatives(clustered_waters, min_samples) + # Call the writing function + write_pdb_clusters_and_representatives(clustered_waters, min_samples, output_directory) def process_trajectory_and_cluster(topology, trajectory, water_eps, output_directory="./stableWaters"): # Load the PDB and DCD files u = mda.Universe(topology, trajectory) output_directory += "_clusterEps_" - strEps = str(water_eps).replace(".", "_") + strEps = str(water_eps).replace(".", "") output_directory += strEps os.makedirs(output_directory, exist_ok=True) # Get the total number of frames for the progress bar @@ -193,16 +193,20 @@ def read_pdb_as_dataframe(pdb_file): # Encapsulate the code in a function def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, cluster_eps, output_directory="./stableWaters", distance_threshold=5.0): - representative_waters = read_pdb_as_dataframe(representative_waters_file) - filtered_structure = filter_and_parse_pdb(protein_pdb_file) - interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold) - - result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues']) - - # Export to CSV output_directory += "_clusterEps_" - strEps = str(cluster_eps).replace(".", "_") + strEps = str(cluster_eps).replace(".", "") output_directory += strEps - result_df.to_csv(os.path.join(output_directory, "interacting_residues.csv"), index=False) - print("Exported interacting_residues.csv") + + # Iterate over subdirectories + for subdirectory in os.listdir(output_directory): + subdirectory_path = os.path.join(output_directory, subdirectory) + if os.path.isdir(subdirectory_path): + # Perform operations within each subdirectory + representative_waters = read_pdb_as_dataframe(os.path.join(subdirectory_path, representative_waters_file)) + filtered_structure = filter_and_parse_pdb(protein_pdb_file) + interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold) + result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues']) + # Save result to each subdirectory + result_df.to_csv(os.path.join(subdirectory_path, "interacting_residues.csv"), index=False) + print(f"Exported interacting_residues.csv in {subdirectory_path}")