gather_xray_data.py
#!/nsls2/conda/envs/2023-1.1-py39/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 17 09:00:15 2023
@author: dkreitler
"""
import pandas
import subprocess
import datetime
"""
INSTRUCTIONS:
Edit DATA_DIRECTORY to include path to xray data, e.g.
/nsls2/data/amx/proposals/2023-3/pass-313937/313937-20231105-dtime
Edit SAMPLE_NAME to include unique sample id in master.h5 filename
for including in the dataset
"""
DATA_DIRECTORY = ""
SAMPLE_NAME = ""
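# Example values (a sketch only; the directory is the example from the
# docstring above, and "mysample" is a hypothetical sample id):
# DATA_DIRECTORY = "/nsls2/data/amx/proposals/2023-3/pass-313937/313937-20231105-dtime"
# SAMPLE_NAME = "mysample"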
# gather per-pipeline processing summaries into a single DataFrame
find_cmd = ["find", f"{DATA_DIRECTORY}", "-maxdepth", "4", "-name", "*summary.csv"]
find_output = subprocess.check_output(find_cmd, universal_newlines=True)
csv_files = find_output.splitlines()
df = pandas.DataFrame()
for f in csv_files:
    df_to_add = pandas.read_csv(f)
    # pipeline name is taken from the summary filename (text before the first ".")
    df_to_add["pipeline"] = f.split("/")[-1].split(".")[0]
    df = pandas.concat([df, df_to_add], ignore_index=True)
# fix autoPROC -> autoProc so the pipeline label matches the reflection-file paths searched below
df.loc[df["pipeline"] == "autoPROC", "pipeline"] = "autoProc"
# add xtal_id: the first component of Sample_Path identifies the crystal
df["xtal_id"] = df["Sample_Path"].str.split("/").str[0]
# keep only the requested sample, then, for each crystal, keep the processing
# result with the lowest "Hi" value
df_filtered = df[df["xtal_id"].str.contains(f"{SAMPLE_NAME}")]
final_df = df_filtered.groupby("xtal_id").apply(
    lambda group: group.loc[group["Hi"].idxmin()]
)
# get list of all reflection files
find_cmd = [
    "find",
    f"{DATA_DIRECTORY}",
    "-name",
    "truncate-unique.mtz",
    "-o",
    "-name",
    "fast_dp.mtz",
]
reflection_files = subprocess.check_output(
    find_cmd, universal_newlines=True
).splitlines()
print(reflection_files)
# attach the matching reflection (.mtz) file path to each selected result
final_df["filepath"] = ""
for index, row in final_df.iterrows():
    for f in reflection_files:
        if (row["Sample_Path"] in f) and (row["pipeline"] in f):
            final_df.at[index, "filepath"] = f
print(final_df)
final_df.to_csv(
    f"{SAMPLE_NAME}.{datetime.datetime.now().strftime('%Y%m%d')}.filtered.csv"
)
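# How to run (a sketch, assuming the NSLS-II conda environment named in the
# shebang is available on the host with access to DATA_DIRECTORY):
#   1. Fill in DATA_DIRECTORY and SAMPLE_NAME above.
#   2. ./gather_xray_data.py   (or: python3 gather_xray_data.py)
# The filtered table is written to <SAMPLE_NAME>.<YYYYMMDD>.filtered.csv in the
# current working directory.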