Skip to content

Commit

Permalink
Merge pull request #47 from Habacef/main
Browse files Browse the repository at this point in the history
Allow user to select & configure stable water-analysis during setup
  • Loading branch information
talagayev authored Dec 15, 2023
2 parents 2242d69 + 19c2912 commit 3416a17
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 51 deletions.
75 changes: 48 additions & 27 deletions openmmdl/openmmdl_analysis/find_stable_waters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@

# the following function is called by the last function in this script. It is only to be run if the water was already calculated.
# finding the water molecules is the campute intensive task, thus i splitted it in the two subtasks.
def perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory):
def write_pdb_clusters_and_representatives(clustered_waters):
def perform_clustering_and_writing(stable_waters, cluster_eps, total_frames, output_directory):
def write_pdb_clusters_and_representatives(clustered_waters, min_samples, output_directory):
atom_counter = 1
pdb_file_counter = 1
print("cluster_eps:")
print(cluster_eps)
print("minsamples:")
print(min_samples)
sub_output_directory = output_directory + "/clusterSize"
sub_output_directory += str(min_samples)
os.makedirs(sub_output_directory, exist_ok=True)
with pd.option_context('display.max_rows', None): # Temporarily set display options
for label, cluster in clustered_waters.groupby("Cluster_Label"):
pdb_lines = []
Expand All @@ -29,7 +32,7 @@ def write_pdb_clusters_and_representatives(clustered_waters):
atom_counter += 1

# Write the current cluster to a new PDB file
output_filename = os.path.join(output_directory, f"cluster_{label}.pdb")
output_filename = os.path.join(sub_output_directory, f"cluster_{label}.pdb")
with open(output_filename, "w") as pdb_file:
pdb_file.write("".join(pdb_lines))
print(f"Cluster {label} written")
Expand All @@ -39,7 +42,7 @@ def write_pdb_clusters_and_representatives(clustered_waters):
# Write representative water molecules to a PDB file
representative_waters = clustered_waters.groupby("Cluster_Label").mean()
representative_waters.reset_index(inplace=True)
representative_filename = "representative_waters.pdb"
representative_filename = os.path.join(sub_output_directory, "representative_waters.pdb")
with open(representative_filename, "w") as pdb_file:
for index, row in representative_waters.iterrows():
x, y, z = row["Oxygen_X"], row["Oxygen_Y"], row["Oxygen_Z"]
Expand All @@ -48,27 +51,37 @@ def write_pdb_clusters_and_representatives(clustered_waters):

# Feature extraction: XYZ coordinates
X = stable_waters[["Oxygen_X", "Oxygen_Y", "Oxygen_Z"]]

# List of percentages to iterate over
percentage_values = [25, 50, 75, 90, 99]

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples)
labels = dbscan.fit_predict(X)
# Assuming total_frames and X are defined outside of this snippet
for percent in percentage_values:
# Adjust min_percent based on the current iteration
min_percent = percent / 100

# Filter out noise (clusters with fewer than a threshold number of points)
clustered_waters = stable_waters.copy()
clustered_waters["Cluster_Label"] = labels
clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise
# Rest of your code remains the same
min_samples = int(min_percent * total_frames)
dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples)
labels = dbscan.fit_predict(X)

# Call the writing function
write_pdb_clusters_and_representatives(clustered_waters)
# Filter out noise and call the writing function
clustered_waters = stable_waters.copy()
clustered_waters["Cluster_Label"] = labels
clustered_waters = clustered_waters[clustered_waters["Cluster_Label"] != -1] # Remove noise

def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_percent=0.45, output_directory="./stableWaters"):
# Call the writing function
write_pdb_clusters_and_representatives(clustered_waters, min_samples, output_directory)

def process_trajectory_and_cluster(topology, trajectory, water_eps, output_directory="./stableWaters"):
# Load the PDB and DCD files
u = mda.Universe(topology, trajectory)
output_directory += "_clusterEps_"
strEps = str(water_eps).replace(".", "")
output_directory += strEps
os.makedirs(output_directory, exist_ok=True)
# Get the total number of frames for the progress bar
total_frames = len(u.trajectory)
min_samples = min_percent*total_frames
min_samples = int(min_samples)
# Create an empty DataFrame to store stable water coordinates
stable_waters = pd.DataFrame(columns=["Frame", "Residue", "Oxygen_X", "Oxygen_Y", "Oxygen_Z"])

Expand Down Expand Up @@ -110,10 +123,10 @@ def process_trajectory_and_cluster(topology, trajectory, cluster_eps=1.0, min_pe
stable_waters.to_csv(os.path.join(output_directory, "stable_waters.csv"), index=False)

# Call the clustering and writing function with the stable_waters DataFrame and output directory
perform_clustering_and_writing(stable_waters, cluster_eps, min_samples, output_directory)
perform_clustering_and_writing(stable_waters, water_eps, total_frames, output_directory)

# Call the function with the desired water type and specify the output directory
# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", cluster_eps=1.0, min_samples=1500, output_directory=".")
# process_trajectory_and_cluster("your_topology.pdb", "your_trajectory.dcd", water_eps=1.0, min_samples=1500, output_directory=".")



Expand Down Expand Up @@ -179,13 +192,21 @@ def read_pdb_as_dataframe(pdb_file):
return representative_waters

# Encapsulate the code in a function
def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, distance_threshold=5.0):
representative_waters = read_pdb_as_dataframe(representative_waters_file)
filtered_structure = filter_and_parse_pdb(protein_pdb_file)
interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold)

result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues'])
def analyze_protein_and_water_interaction(protein_pdb_file, representative_waters_file, cluster_eps, output_directory="./stableWaters", distance_threshold=5.0):
output_directory += "_clusterEps_"
strEps = str(cluster_eps).replace(".", "")
output_directory += strEps

# Export to CSV
result_df.to_csv("interacting_residues.csv", index=False)
print("Exported interacting_residues.csv")
# Iterate over subdirectories
for subdirectory in os.listdir(output_directory):
subdirectory_path = os.path.join(output_directory, subdirectory)
if os.path.isdir(subdirectory_path):
# Perform operations within each subdirectory
representative_waters = read_pdb_as_dataframe(os.path.join(subdirectory_path, representative_waters_file))
filtered_structure = filter_and_parse_pdb(protein_pdb_file)
interacting_residues = find_interacting_residues(filtered_structure, representative_waters, distance_threshold)
result_df = pd.DataFrame(interacting_residues.items(), columns=['Cluster_Number', 'Interacting_Residues'])
# Save result to each subdirectory
result_df.to_csv(os.path.join(subdirectory_path, "interacting_residues.csv"), index=False)
print(f"Exported interacting_residues.csv in {subdirectory_path}")

16 changes: 13 additions & 3 deletions openmmdl/openmmdl_analysis/openmmdlanalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ def main():
parser.add_argument('-nuc', dest='receptor_nucleic', help='Treat nucleic acids as receptor', default=False)
parser.add_argument('-s', dest='special_ligand', help='Calculate interactions with special ligands', default=None)
parser.add_argument('-pep', dest='peptide', help='Calculate interactions with peptides. Give the peptides chain name as input. Defaults to None', default=None)
parser.add_argument('-w', dest='stable_water_analysis', help='Should stable water analysis be performed? True or False', default=False)
parser.add_argument('--watereps', dest='water_eps', help='Set the Eps for clustering, this defines how big clusters can be spatially in Angstrom', default=1.0)

input_formats = ['.pdb', '.dcd', '.sdf', '.csv']
args = parser.parse_args()
Expand All @@ -78,11 +80,14 @@ def main():
topology = args.topology
trajectory = args.trajectory

if not args.ligand_sdf and args.peptide == None:
water_eps = float(args.water_eps)
stable_water_analysis = bool(args.stable_water_analysis)
# The following is the current water analysis if no ligand is present.
if not args.ligand_sdf and args.peptide == None and stable_water_analysis:
print("All analyses will be run which can be done without a ligand present")
#...
process_trajectory_and_cluster(topology, trajectory)
analyze_protein_and_water_interaction(topology,"representative_waters.pdb")
process_trajectory_and_cluster(topology, trajectory, water_eps)
analyze_protein_and_water_interaction(topology,"representative_waters.pdb", water_eps)
sys.exit()


Expand Down Expand Up @@ -508,6 +513,11 @@ def main():
print("\033[1mPharmacophores generated\033[0m")

print("\033[1mAnalysis is Finished.\033[0m")

if stable_water_analysis:
process_trajectory_and_cluster(topology, trajectory, water_eps)
analyze_protein_and_water_interaction(topology,"representative_waters.pdb")



if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 3416a17

Please sign in to comment.