From 1457922cd9956404408ad4fae6829e2265d9f6fd Mon Sep 17 00:00:00 2001 From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com> Date: Fri, 1 Dec 2023 20:04:12 +0100 Subject: [PATCH] Delete openmmdl/openmmdl_analysis/write_all_stable_waters(2).py removed --- .../write_all_stable_waters(2).py | 196 ------------------ 1 file changed, 196 deletions(-) delete mode 100644 openmmdl/openmmdl_analysis/write_all_stable_waters(2).py diff --git a/openmmdl/openmmdl_analysis/write_all_stable_waters(2).py b/openmmdl/openmmdl_analysis/write_all_stable_waters(2).py deleted file mode 100644 index a1219022..00000000 --- a/openmmdl/openmmdl_analysis/write_all_stable_waters(2).py +++ /dev/null @@ -1,196 +0,0 @@ -import MDAnalysis as mda -import pandas as pd -import numpy as np -from io import StringIO -from sklearn.cluster import DBSCAN -from tqdm import tqdm -from Bio.PDB import PDBParser - - -# the following function is called by the last function in this script. It is only to be run if the water was already calculated. -# finding the water molecules is the campute intensive task, thus i splitted it in the two subtasks. -def perform_clustering_and_writing(stable_waters, cluster_eps=1.0, min_samples=1500): - def write_pdb_clusters_and_representatives(clustered_waters): - atom_counter = 1 - pdb_file_counter = 1 - - with pd.option_context('display.max_rows', None): # Temporarily set display options - for label, cluster in clustered_waters.groupby("Cluster_Label"): - pdb_lines = [] - - for _, row in cluster.iterrows(): - x, y, z = row["Oxygen_X"], row["Oxygen_Y"], row["Oxygen_Z"] - atom_counter = 1 if atom_counter > 9999 else atom_counter - pdb_line = f"ATOM{atom_counter:6} O WAT A{atom_counter:4} {x:8.3f}{y:8.3f}{z:8.3f} 1.00 0.00 O\n" - pdb_lines.append(pdb_line) - atom_counter += 1 - - # Write the current cluster to a new PDB file - output_filename = f"cluster_{label}.pdb" - with open(output_filename, "w") as pdb_file: - pdb_file.write("".join(pdb_lines)) - - pdb_file_counter += 1 - - # Write representative water molecules to a PDB file - representative_waters = clustered_waters.groupby("Cluster_Label").mean() - representative_waters.reset_index(inplace=True) - representative_filename = "representative_waters.pdb" - with open(representative_filename, "w") as pdb_file: - for index, row in representative_waters.iterrows(): - x, y, z = row["Oxygen_X"], row["Oxygen_Y"], row["Oxygen_Z"] - pdb_line = f"ATOM{index + 1:6} O WAT A{index + 1:4} {x:8.3f}{y:8.3f}{z:8.3f} 1.00 0.00 O\n" - pdb_file.write(pdb_line) - - # Feature extraction: XYZ coordinates - X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]] - - # Apply DBSCAN clustering - dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples) - labels = dbscan.fit_predict(X) - - # Filter out noise (clusters with fewer than a threshold number of points) - clustered_waters = stable_waters.copy() - clustered_waters["Cluster_Label"] = labels - clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise - - # Call the writing function - write_pdb_clusters_and_representatives(clustered_waters) - -def process_trajectory_and_cluster(water_type, cluster_eps=1.0, min_samples=1500): - # Load the PDB and DCD files - u = mda.Universe("2.pdb", "2.dcd") - - # Get the total number of frames for the progress bar - total_frames = len(u.trajectory) - - # Create an empty DataFrame to store stable water coordinates - stable_waters = pd.DataFrame(columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"]) - - # Initialize variables for the previous frame's data - prev_frame_coords = {} - - # Iterate through frames with tqdm for the progress bar - for ts in tqdm(u.trajectory, total=total_frames, desc="Processing frames"): - frame_num = ts.frame - frame_coords = {} - - # Iterate through oxygen atoms of the specified water type - for atom in u.select_atoms(f"resname {water_type} and name O"): - frame_coords[atom.index] = (atom.position[0], atom.position[1], atom.position[2]) - - # Check if it's not the first frame - if frame_num > 0: - stable_coords = [] - - # Iterate through the oxygen atoms in the current frame - for atom_index, coords in frame_coords.items(): - prev_coords = prev_frame_coords.get(atom_index, coords) # Get previous coordinates or use current if not found - - # Calculate the distance between current and previous coordinates - distance = np.linalg.norm(np.array(coords) - np.array(prev_coords)) - - # If the distance is less than 1 Angstrom, consider it a stable water - if distance < 1: - stable_coords.append((frame_num, atom_index, coords[0], coords[1], coords[2])) - - # Append stable water coordinates to the stable_waters DataFrame - if stable_coords: # Check if stable_coords is not empty - stable_waters = pd.concat([stable_waters, pd.DataFrame(stable_coords, columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"])]) - - # Update the previous frame's coordinates - prev_frame_coords = frame_coords - - # Save the stable_waters DataFrame to a CSV file - stable_waters.to_csv(f"stable_waters_{water_type}.csv", index=False) - - # Call the clustering and writing function with the stable_waters DataFrame - perform_clustering_and_writing(stable_waters, cluster_eps, min_samples) - -# Call the function with the desired water type -# process_trajectory_and_cluster("T4P") - -# process_trajectory_and_cluster("SPC") -# process_trajectory_and_cluster("T3P") - -def filter_and_parse_pdb(protein_pdb): - with open(protein_pdb, 'r') as pdb_file: - lines = [ - line for line in pdb_file if ( - line.startswith('ATOM') and - line[17:20].strip() not in ['HOH', 'WAT'] and - line[22:26].strip().isdigit() # Exclude lines with non-numeric sequence identifiers - ) - ] - - # Convert the list of lines to a string buffer - pdb_string = ''.join(lines) - pdb_buffer = StringIO(pdb_string) - - # Now parse the filtered lines - parser = PDBParser(QUIET=True) - structure = parser.get_structure("protein", pdb_buffer) - - return structure - -def find_interacting_residues(structure, representative_waters, distance_threshold): - interacting_residues = {} - - for model in structure: - for chain in model: - for residue in chain: - # Check if the residue is a protein residue (not a heteroatom or water molecule) - if residue.id[0] == ' ' and residue.id[2] == ' ' and residue.resname not in ['HOH', 'WAT']: - for wat_index, wat_row in representative_waters.iterrows(): - wat_coords = np.array([wat_row["Oxygen_X"], wat_row["Oxygen_Y"], wat_row["Oxygen_Z"]]) - residue_coords = np.array([atom.get_coord() for atom in residue.get_atoms()][0]) - - distance = np.linalg.norm(wat_coords - residue_coords) - if distance < distance_threshold: - key = wat_index # Assuming wat_index is the number of the water cluster - if key not in interacting_residues: - interacting_residues[key] = [] - interacting_residues[key].append((chain.id, residue.id[1])) - - return interacting_residues - -def read_pdb_as_dataframe(pdb_file): - lines = [] - with open(pdb_file, 'r') as f: - lines = f.readlines() - - # Extract relevant information from PDB file lines - data = [] - for line in lines: - if line.startswith("ATOM"): - x = float(line[30:38].strip()) - y = float(line[38:46].strip()) - z = float(line[46:54].strip()) - data.append([x, y, z]) - - # Create a DataFrame - columns = ["Oxygen_X", "Oxygen_Y", "Oxygen_Z"] - representative_waters = pd.DataFrame(data, columns=columns) - - return representative_waters - -# Encapsulate the code in a function -def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, distance_threshold=5.0): - representative_waters = read_pdb_as_dataframe(representative_waters_file) - filtered_structure = filter_and_parse_pdb(protein_pdb_file) - interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold) - - result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues']) - - # Export to CSV - result_df.to_csv("interacting_residues.csv", index=False) - print("Exported interacting_residues.csv") - -# Example usage -#protein_pdb_file = "2.pdb" -#representative_waters_file = "representative_waters.pdb" -#analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file) - - - -