From 6109a611c5eb6552fecf0daed5f8bb0b9a76de75 Mon Sep 17 00:00:00 2001 From: Leon Obendorf <44522423+Habacef@users.noreply.github.com> Date: Fri, 15 Dec 2023 11:52:32 +0100 Subject: [PATCH 1/2] Water-analysis control during setup Added the option to control stable-water analysis during setup. Also optimized the output folder structure of the analysis and set to analyze water-clusters on different amount of cluster-sizes (water pesent in 25%, 50%, 75%, 90% and 99% of the MD) --- .../openmmdl_analysis/find_stable_waters.py | 55 ++++++++++++------- .../openmmdl_analysis/openmmdlanalysis.py | 16 +++++- openmmdl/openmmdl_setup/openmmdlsetup.py | 37 +++++++------ .../templates/simulationOptions.html | 21 +++++-- 4 files changed, 86 insertions(+), 43 deletions(-) diff --git a/openmmdl/openmmdl_analysis/find_stable_waters.py b/openmmdl/openmmdl_analysis/find_stable_waters.py index ef221a27..846b7ba3 100644 --- a/openmmdl/openmmdl_analysis/find_stable_waters.py +++ b/openmmdl/openmmdl_analysis/find_stable_waters.py @@ -9,8 +9,8 @@ # the following function is called by the last function in this script. It is only to be run if the water was already calculated. # finding the water molecules is the campute intensive task, thus i splitted it in the two subtasks. -def perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory): - def write_pdb_clusters_and_representatives(clustered_waters): +def perform_clustering_and_writing(stable_waters, cluster_eps, output_directory): + def write_pdb_clusters_and_representatives(clustered_waters, min_samples): atom_counter = 1 pdb_file_counter = 1 print("cluster_eps:") @@ -29,6 +29,9 @@ def write_pdb_clusters_and_representatives(clustered_waters): atom_counter += 1 # Write the current cluster to a new PDB file + output_directory += "_" + output_directory += min_samples + os.makedirs(output_directory, exist_ok=True) output_filename = os.path.join(output_directory, f"cluster_{label}.pdb") with open(output_filename, "w") as pdb_file: pdb_file.write("".join(pdb_lines)) @@ -39,7 +42,7 @@ def write_pdb_clusters_and_representatives(clustered_waters): # Write representative water molecules to a PDB file representative_waters = clustered_waters.groupby("Cluster_Label").mean() representative_waters.reset_index(inplace=True) - representative_filename = "representative_waters.pdb" + representative_filename = os.path.join(output_directory, "representative_waters.pdb") with open(representative_filename, "w") as pdb_file: for index, row in representative_waters.iterrows(): x, y, z = row["Oxygen_X"], row["Oxygen_Y"], row["Oxygen_Z"] @@ -48,27 +51,37 @@ def write_pdb_clusters_and_representatives(clustered_waters): # Feature extraction: XYZ coordinates X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]] + + # List of percentages to iterate over + percentage_values = [25, 50, 75, 90, 99] - # Apply DBSCAN clustering - dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples) - labels = dbscan.fit_predict(X) + # Assuming total_frames and X are defined outside of this snippet + for percent in percentage_values: + # Adjust min_percent based on the current iteration + min_percent = percent / 100 - # Filter out noise (clusters with fewer than a threshold number of points) - clustered_waters = stable_waters.copy() - clustered_waters["Cluster_Label"] = labels - clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise + # Rest of your code remains the same + min_samples = int(min_percent * total_frames) + dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples) + labels = dbscan.fit_predict(X) - # Call the writing function - write_pdb_clusters_and_representatives(clustered_waters) + # Filter out noise and call the writing function + clustered_waters = stable_waters.copy() + clustered_waters["Cluster_Label"] = labels + clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise -def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_percent=0.45, output_directory="./stableWaters"): + # Call the writing function + write_pdb_clusters_and_representatives(clustered_waters, min_samples) + +def process_trajectory_and_cluster(topology, trajectory, water_eps, output_directory="./stableWaters"): # Load the PDB and DCD files u = mda.Universe(topology, trajectory) + output_directory += "_clusterEps_" + strEps = str(water_eps).replace(".", "_") + output_directory += strEps os.makedirs(output_directory, exist_ok=True) # Get the total number of frames for the progress bar total_frames = len(u.trajectory) - min_samples = min_percent*total_frames - min_samples = int(min_samples) # Create an empty DataFrame to store stable water coordinates stable_waters = pd.DataFrame(columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"]) @@ -110,10 +123,10 @@ def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_pe stable_waters.to_csv(os.path.join(output_directory, "stable_waters.csv"), index=False) # Call the clustering and writing function with the stable_waters DataFrame and output directory - perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory) + perform_clustering_and_writing(stable_waters, water_eps, total_frames, output_directory) # Call the function with the desired water type and specify the output directory -# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", cluster_eps=1.0, min_samples=1500, output_directory=".") +# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", water_eps=1.0, min_samples=1500, output_directory=".") @@ -179,7 +192,7 @@ def read_pdb_as_dataframe(pdb_file): return representative_waters # Encapsulate the code in a function -def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, distance_threshold=5.0): +def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, cluster_eps, output_directory="./stableWaters", distance_threshold=5.0): representative_waters = read_pdb_as_dataframe(representative_waters_file) filtered_structure = filter_and_parse_pdb(protein_pdb_file) interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold) @@ -187,5 +200,9 @@ def analyze_protein_and_water_interaction(protein_pdb_file, representative_water result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues']) # Export to CSV - result_df.to_csv("interacting_residues.csv", index=False) + output_directory += "_clusterEps_" + strEps = str(cluster_eps).replace(".", "_") + output_directory += strEps + result_df.to_csv(os.path.join(output_directory, "interacting_residues.csv"), index=False) print("Exported interacting_residues.csv") + diff --git a/openmmdl/openmmdl_analysis/openmmdlanalysis.py b/openmmdl/openmmdl_analysis/openmmdlanalysis.py index 30cef20b..ed6ca187 100644 --- a/openmmdl/openmmdl_analysis/openmmdlanalysis.py +++ b/openmmdl/openmmdl_analysis/openmmdlanalysis.py @@ -62,6 +62,8 @@ def main(): parser.add_argument('-nuc', dest='receptor_nucleic', help='Treat nucleic acids as receptor', default=False) parser.add_argument('-s', dest='special_ligand', help='Calculate interactions with special ligands', default=None) parser.add_argument('-pep', dest='peptide', help='Calculate interactions with peptides. Give the peptides chain name as input. Defaults to None', default=None) + parser.add_argument('-w', dest='stable_water_analysis', help='Should stable water analysis be performed? True or False', default=False) + parser.add_argument('--watereps', dest='water_eps', help='Set the Eps for clustering, this defines how big clusters can be spatially in Angstrom', default=1.0) input_formats = ['.pdb', '.dcd', '.sdf', '.csv'] args = parser.parse_args() @@ -74,11 +76,14 @@ def main(): topology = args.topology trajectory = args.trajectory - if not args.ligand_sdf and args.peptide == None: + water_eps = float(args.water_eps) + stable_water_analysis = bool(args.stable_water_analysis) + # The following is the current water analysis if no ligand is present. + if not args.ligand_sdf and args.peptide == None and stable_water_analysis: print("All analyses will be run which can be done without a ligand present") #... - process_trajectory_and_cluster(topology, trajectory) - analyze_protein_and_water_interaction(topology,"representative_waters.pdb") + process_trajectory_and_cluster(topology, trajectory, water_eps) + analyze_protein_and_water_interaction(topology,"representative_waters.pdb", water_eps) sys.exit() @@ -495,6 +500,11 @@ def main(): print("\033[1mPharmacophores generated\033[0m") print("\033[1mAnalysis is Finished.\033[0m") + + if stable_water_analysis: + process_trajectory_and_cluster(topology, trajectory, water_eps) + analyze_protein_and_water_interaction(topology,"representative_waters.pdb") + if __name__ == "__main__": diff --git a/openmmdl/openmmdl_setup/openmmdlsetup.py b/openmmdl/openmmdl_setup/openmmdlsetup.py index 67680125..7eb73e0f 100644 --- a/openmmdl/openmmdl_setup/openmmdlsetup.py +++ b/openmmdl/openmmdl_setup/openmmdlsetup.py @@ -821,7 +821,9 @@ def configureDefaultOptions(): session['binding_mode'] = '40' session['min_transition'] = '1' session['rmsd_diff'] = 'No' - session['pml_generation'] = 'True' + session['pml_generation'] = 'True' + session['stable_water'] = 'Yes' + session['wc_distance'] = '1.0' if session['fileType'] == 'pdb' and session['waterModel'] == 'implicit': implicitWater = True session['ensemble'] = 'nvt' if implicitWater else 'npt' @@ -1283,6 +1285,7 @@ def _xml_script_segment(to_serialize, target_file): lines = [line.format(filename=session['finalStateFilename']) for line in state_script] script.extend(lines) + if session ['md_postprocessing'] == 'True': if fileType == "pdb": script.append("mdtraj_conversion(f'Equilibration_{protein}', '%s')" % session['mdtraj_output']) @@ -1336,47 +1339,47 @@ def _xml_script_segment(to_serialize, target_file): script.append("os.chdir('Final_Output/All_Atoms')") if fileType == "pdb": if session['sdfFile']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['sdfFile'] == '': - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif fileType == "amber": if session['nmLig'] or session['spLig']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) else: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['analysis_selection'] == 'analysis_prot_lig': script.append("os.chdir('Final_Output/Prot_Lig')") if fileType == "pdb": if session['sdfFile']: - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['sdfFile'] == '': - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif fileType == 'amber': if session['nmLig'] or session['spLig']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) else: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['analysis_selection'] == 'analysis_all_prot_lig': if fileType == "pdb": script.append("os.chdir('Final_Output/All_Atoms')") if session['sdfFile']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -l %s -n UNK -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['sdfFile'], session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif session['sdfFile'] == '': - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t prot_lig_top%s -d prot_lig_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) elif fileType == 'amber': script.append("os.chdir('Final_Output/All_Atoms')") if session['nmLig'] or session['spLig']: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -l %s.sdf -n %s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, nmLigFileName[:-4], nmLigName, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) else: - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("os.chdir('../Prot_Lig')") - script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'])) + script.append("analysis_run_command = 'openmmdl_analysis -t centered_top%s -d centered_traj%s -b %s -m %s -r %s -p %s -w %s --watereps %s' " % (top_ext, traj_ext, session['binding_mode'], session['min_transition'], session['rmsd_diff'], session['pml_generation'], session['stable_water'], session['wc_distance'])) script.append("subprocess.run(analysis_run_command, shell=True, check=True)") return "\n".join(script) diff --git a/openmmdl/openmmdl_setup/templates/simulationOptions.html b/openmmdl/openmmdl_setup/templates/simulationOptions.html index df7de897..32a93ac3 100644 --- a/openmmdl/openmmdl_setup/templates/simulationOptions.html +++ b/openmmdl/openmmdl_setup/templates/simulationOptions.html @@ -223,10 +223,20 @@ {{ textfield('min_transition', 'Enter the minimal markov state transition for the figure generation without %.') }}