from .utils import (
    Dataset,
    Selection,
    Ntuple,
    Cut,
    Weight,
    Action,
    Count,
    Histogram,
    Variation,
)
from ROOT import gROOT
gROOT.SetBatch(True)
from ROOT import TFile
import os
import re
import yaml
import itertools
from XRootD import client
import logging
logger = logging.getLogger(__name__)
def dataset_from_artusoutput(
dataset_name, file_names, folder, files_base_directory, friends_base_directories
):
"""Create a Dataset object from a list containing the names
of the ROOT files (e.g. [root_file1, root_file2, (...)]):
ntuple1: /file_base_dir/root_file1/folder/ntuple
friend1: /friend1_base_dir/root_file1/folder/ntuple
friend2: /friend2_base_dir/root_file1/folder/ntuple
ntuple2: /file_base_dir/root_file2/folder/ntuple
friend1: /friend1_base_dir/root_file2/folder/ntuple
friend2: /friend2_base_dir/root_file2/folder/ntuple
ntuple3: /file_base_dir/root_file3/folder/ntuple
friend1: /friend1_base_dir/root_file3/folder/ntuple
friend2: /friend2_base_dir/root_file3/folder/ntuple
(...)
Args:
dataset_name (str): Name of the dataset
file_names (list): List containing the names of the .root
files
folder (str): Name of the TDirectoryFile in each .root file
        files_base_directory (str): Path to the base directory of the
            ntuple files
        friends_base_directories (list): List of paths to the base
            directories of the friend files
Returns:
dataset (Dataset): Dataset object containing TTrees
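    Example:
        An illustrative sketch; the sample, folder and directory names are
        placeholders, not values provided by this package:
            dataset = dataset_from_artusoutput(
                "DYJetsToLL",
                ["DYJetsToLL_file_1", "DYJetsToLL_file_2"],
                "mt_nominal",
                "/path/to/ntuples",
                ["/path/to/friends/SVFit", "/path/to/friends/MELA"],
            )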
"""
def get_full_tree_name(folder, path_to_root_file, tree_name):
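        """Check that ``folder`` exists as a key in the ROOT file at
        ``path_to_root_file`` and return the full tree name
        '<folder>/<tree_name>'."""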
root_file = TFile.Open(path_to_root_file)
        if not root_file or root_file.IsZombie():
            logger.fatal("File {} does not exist or could not be opened, abort".format(path_to_root_file))
raise FileNotFoundError
if folder not in root_file.GetListOfKeys():
logger.fatal(
"Folder {} does not exist in {}\n".format(folder, path_to_root_file)
)
raise NameError
root_file.Close()
full_tree_name = "/".join([folder, tree_name])
return full_tree_name
def add_tagged_friends(friends):
"""Tag friends with the name of the different directories
in the artus name scheme, e.g.:
/common_path/MELA/ntuple -> tag: MELA
/common_path/SVFit/ntuple -> tag: SVFit
        When two friend paths are compared, only one directory differs in
        this scheme (MELA vs. SVFit above), so the symmetric difference of
        their path components yields exactly these two strings ('tags');
        each tag is then assigned to the friend whose path contains it, if
        that friend's tag is still None.
"""
for f1, f2 in itertools.combinations(friends, 2):
l1 = f1.path.split("/")
l2 = f2.path.split("/")
tags = list(set(l1).symmetric_difference(set(l2)))
if tags:
for t in tags:
if t in l1 and f1.tag is None:
f1.tag = t
elif t in l2 and f2.tag is None:
f2.tag = t
return friends
# E.g.: file_base_dir/file_name/file_name.root
root_files = [
os.path.join(files_base_directory, f, "{}.root".format(f)) for f in file_names
]
# E.g.: file_base_dir/file_name1/file_name1.root/folder/ntuple
    #       file_base_dir/file_name2/file_name2.root/folder/ntuple
ntuples = []
for root_file, file_name in zip(root_files, file_names):
tdf_tree = get_full_tree_name(folder, root_file, "ntuple")
friends = []
for friends_base_directory in friends_base_directories:
friend_path = os.path.join(
friends_base_directory, file_name, "{}.root".format(file_name)
)
tdf_tree_friend = get_full_tree_name(folder, friend_path, "ntuple")
if tdf_tree != tdf_tree_friend:
                logger.fatal(
                    "TDirectoryFile extracted from the friend file does not match the one of the base file."
                )
raise Exception
friends.append(Ntuple(friend_path, tdf_tree_friend))
ntuples.append(Ntuple(root_file, tdf_tree, add_tagged_friends(friends)))
return Dataset(dataset_name, ntuples)
def dataset_from_crownoutput(
dataset_name,
file_names,
era,
channel,
folder,
files_base_directory,
friends_base_directories=None,
validate_samples=False,
validation_tag="v1",
xrootd=False,
):
"""Create a Dataset object from a list containing the names
of the ROOT files (e.g. [root_file1, root_file2, (...)]):
ntuple1: /file_base_dir/root_file1/folder/ntuple
friend1: /friend1_base_dir/root_file1/folder/ntuple
friend2: /friend2_base_dir/root_file1/folder/ntuple
ntuple2: /file_base_dir/root_file2/folder/ntuple
friend1: /friend1_base_dir/root_file2/folder/ntuple
friend2: /friend2_base_dir/root_file2/folder/ntuple
ntuple3: /file_base_dir/root_file3/folder/ntuple
friend1: /friend1_base_dir/root_file3/folder/ntuple
friend2: /friend2_base_dir/root_file3/folder/ntuple
(...)
    Args:
        dataset_name (str): Name of the dataset
        file_names (list): List containing the names of the sample
            directories
        era (str): Name of the era, needed for directories
        channel (str): Name of the considered channel, needed for directories
        folder (str): Name of the TDirectoryFile in each .root file
        files_base_directory (str): Path to the base directory of the
            ntuple files
        friends_base_directories (list): List of paths to the base
            directories of the friend files
        validate_samples (bool): Currently unused
        validation_tag (str): Version tag of the validation database cache
        xrootd (bool): If True, read the input files via XRootD
Returns:
dataset (Dataset): Dataset object containing TTrees
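    Example:
        An illustrative sketch; the era, channel, sample names and paths are
        placeholders, not values provided by this package:
            dataset = dataset_from_crownoutput(
                "DYJetsToLL",
                ["DYJetsToLL_M-50"],
                "2018",
                "mt",
                "ntuple",
                "/path/to/CROWN/ntuples",
                friends_base_directories=["/path/to/CROWNFriends/fastmtt"],
            )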
"""
def get_quantities_per_variation(root_file):
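        # Branch names are expected to follow the pattern
        # "<quantity>__<variation>"; e.g. a (hypothetical) leaf named
        # "pt_1__jesUp" would be booked as
        # quantities_per_vars["jesUp"] = ["pt_1", ...]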
quantities_per_vars = {}
quantities_with_variations = root_file.Get("ntuple").GetListOfLeaves()
for qwv in quantities_with_variations:
qwv_name = qwv.GetName()
if "__" in qwv_name:
quantity, var = qwv_name.split("__")
if var not in quantities_per_vars.keys():
quantities_per_vars[var] = []
quantities_per_vars[var].append(quantity)
if "-" in qwv_name:
logger.warning(
"Found a '-' in quantity name {} - This can result in unwanted behaviour for systematic shifts".format(
qwv_name
)
)
return quantities_per_vars
def is_root_file_empty(file_path):
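        """Return True if the ROOT file at ``file_path`` cannot be opened,
        contains no keys, or does not contain an 'ntuple' tree."""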
try:
# Try to open the ROOT file
root_file = TFile.Open(file_path)
# Check if the file was opened successfully
            if not root_file or root_file.IsZombie():
                logger.warning(f"Failed to open file '{file_path}' or it is a zombie.")
                return True
            # Check if the file contains any keys
            if root_file.GetNkeys() == 0:
                logger.warning(f"File '{file_path}' contains no keys.")
                root_file.Close()
                return True
            # Check that the expected 'ntuple' tree is present
            if "ntuple" not in [x.GetTitle() for x in root_file.GetListOfKeys()]:
                logger.warning(f"File '{file_path}' does not contain an 'ntuple' tree.")
                root_file.Close()
                return True
            # If we reach here, the file is not empty
            root_file.Close()
            return False
        except Exception as e:
            logger.warning(f"An error occurred while opening file '{file_path}': {e}")
            return True
def add_tagged_friends(friends):
"""Tag friends with the name of the different directories
in the CROWN name scheme, e.g.:
/common_path/CROWNFriends/fastmtt/ntuple -> tag: fastmtt
/common_path/CROWNMultiFriends/NNclassification/ntuple -> tag: NNclassification
        With CROWN, friend trees can be placed in two folders, `CROWNFriends`
        and `CROWNMultiFriends`, so both have to be checked. The next folder
        in the hierarchy gives the friend tag, independent of whether it is
        a friend or a multifriend.
"""
        for friend in friends:
            path_split_list = friend.path.split("/")
            if "CROWNFriends" in friend.path:
                idx = path_split_list.index("CROWNFriends")
            elif "CROWNMultiFriends" in friend.path:
                idx = path_split_list.index("CROWNMultiFriends")
            else:
                logger.fatal(
                    "Could not determine the friend tag for {}: the path contains neither 'CROWNFriends' nor 'CROWNMultiFriends'".format(
                        friend.path
                    )
                )
                raise ValueError("Unknown friend directory structure")
            friend_tag = path_split_list[idx + 1]
            if friend.tag is None:
                friend.tag = friend_tag
        return friends
def populate_val_database(root_file_path, validation_dict, friends):
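        """Collect validation information for a single ntuple file and its
        friend files and store it in ``validation_dict``.
        For the main file and its friends, the sets of branch names are
        extracted and compared (via the symmetric difference) to the sets
        seen for previous files of the same dataset; unreadable or empty
        files are flagged as such.
        """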
def extract_quantities(infile):
if infile.IsZombie():
logger.fatal("File {} does not exist, abort".format(infile.GetName()))
raise FileNotFoundError
if "ntuple" not in [x.GetTitle() for x in infile.GetListOfKeys()]:
quantities = set()
else:
quantities = set(
[x.GetName() for x in infile.Get("ntuple").GetListOfLeaves()]
)
return quantities
        # Check if the file can be opened and is not empty
        quantities = set()
        if is_root_file_empty(root_file_path):
            is_empty = True
        else:
            is_empty = False
            root_file = TFile.Open(root_file_path)
            quantities = extract_quantities(root_file)
            if "quantities_per_vars" not in validation_dict:
                validation_dict["quantities_per_vars"] = get_quantities_per_variation(
                    root_file
                )
            root_file.Close()
# Do the same for the friend trees
        friend_quantities = set()
friend_empty = False
for f in friends:
if is_root_file_empty(f) or is_empty:
friend_empty = True
else:
friend = TFile.Open(f)
fr_quants = extract_quantities(friend)
friend.Close()
                friend_quantities.update(fr_quants)
# first we check the main ntuple, then the friends
fileinfo = {}
fileinfo["is_empty"] = is_empty
fileinfo["friend_is_empty"] = friend_empty
if len(validation_dict["varset"]) == 0:
validation_dict["varset"] = quantities
difference = set()
else:
difference = validation_dict["varset"].symmetric_difference(quantities)
fileinfo["difference"] = difference
if len(validation_dict["friends_varset"]) == 0:
validation_dict["friends_varset"] = friend_quantitites
difference = set()
else:
            difference = validation_dict["friends_varset"].symmetric_difference(
                friend_quantities
            )
fileinfo["friends"] = friends
fileinfo["friends_difference"] = difference
validation_dict["files"][root_file_path] = fileinfo
return
    # Expected directory layout:
    #   ntuples: files_base_directory/<era>/<sample>/<channel>/*.root
    #   friends: friends_base_directory/<era>/<sample>/<channel>/*.root
    #   (each friends_base_directory already includes the friend type)
root_files = []
# Set up reading of file system via xrootd bindings
if xrootd:
fsname = "root://cmsdcache-kit-disk.gridka.de:1094"
xrdclient = client.FileSystem(fsname)
for f in file_names:
status, listing = xrdclient.dirlist(
os.path.join("", files_base_directory, era, f, channel)
)
try:
                for g in listing:
                    # os.path.join omits parts containing colons as it treats
                    # them like drive specifiers, so build the XRootD path
                    # with a plain "/" join instead
                    filepath = "/".join(
                        [fsname, os.path.join(files_base_directory, era, f, channel, g.name)]
                    )
                    if filepath.endswith(".root"):
                        root_files.append((filepath, f))
except TypeError:
logger.error(
"Could not read file list from directory {}".format(
os.path.join(files_base_directory, era, f, channel)
)
)
raise TypeError
else:
for f in file_names:
for g in os.listdir(os.path.join(files_base_directory, era, f, channel)):
                # only consider files ending in .root
filepath = (os.path.join(files_base_directory, era, f, channel, g), f)
if filepath[0].endswith(".root"):
root_files.append(filepath)
ntuples = []
read_from_database = False
db_path = os.path.join("validation_database",validation_tag)
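    # Validation results are cached as one YAML file per dataset, e.g.
    # (illustratively) validation_database/v1/2018_mt_DYJetsToLL.yaml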
if os.path.exists(f"{db_path}/{era}_{channel}_{dataset_name}.yaml"):
logger.info(
"Reading validation information for dataset {} - {} - {}".format(
era, channel, dataset_name
)
)
with open(f"{db_path}/{era}_{channel}_{dataset_name}.yaml") as fi:
validation_dict = yaml.safe_load(fi)
read_from_database = True
else:
logger.info(
"Running ntuple validation for {} - {} - {}".format(
era, channel, dataset_name
)
)
validation_dict = {"varset": set(), "friends_varset": set(), "files": {}}
for root_file, file_name in root_files:
tdf_tree = "ntuple"
friends = []
friend_paths = []
for friends_base_directory in friends_base_directories:
friend_base_name = os.path.basename(root_file)
if xrootd:
friend_path = "/".join(
[
fsname,
os.path.join(
friends_base_directory,
era,
file_name,
channel,
friend_base_name,
),
]
)
else:
friend_path = os.path.join(
friends_base_directory, era, file_name, channel, friend_base_name
)
friend_paths.append(friend_path)
if not read_from_database:
populate_val_database(root_file, validation_dict, friend_paths)
if root_file not in validation_dict["files"]:
raise ValueError(
"File {} not found in validation results.".format(root_file)
)
if not validation_dict["files"][root_file]["friend_is_empty"]:
for friend_path in friend_paths:
friends.append(Ntuple(friend_path, tdf_tree))
if not validation_dict["files"][root_file]["is_empty"]:
ntuples.append(Ntuple(root_file, tdf_tree, add_tagged_friends(friends)))
    # Cross-check the files found on disk against the validation database
    if len(validation_dict["files"].keys()) > len(root_files):
        miss_files = set(validation_dict["files"].keys()).difference(
            set(rfile[0] for rfile in root_files)
        )
        logger.fatal(
            f"Number of files found and number in the validation database do not agree for dataset {dataset_name}.\n"
            f"Files present in the database but missing on disk: {miss_files}"
        )
        raise ValueError(
            f"Number of files found and number in the validation database do not agree for dataset {dataset_name}."
        )
# Report on obtained validation information for dataset
found_error = False
for root_file, _ in root_files:
diff = validation_dict["files"][root_file]["difference"]
friends_diff = validation_dict["files"][root_file]["friends_difference"]
if len(diff) != 0 or len(friends_diff) != 0:
found_error = True
logger.fatal(
"Validation for {} - {} - {} failed, differences were found".format(
era, channel, dataset_name
)
)
if len(diff) != 0:
logger.fatal("File {} has the following differences:".format(root_file))
logger.fatal("\t{}".format(diff))
if len(friends_diff) != 0:
logger.fatal(
"Friends for file {} have the following differences:".format(
root_file
)
)
logger.fatal("\t{}".format(friends_diff))
if not found_error:
logger.info(
"Validation for {} - {} - {} passed".format(era, channel, dataset_name)
)
# Write the created database
if not read_from_database:
db_path = os.path.join("validation_database",validation_tag)
if not os.path.exists(db_path):
os.makedirs(db_path)
logger.info(
"Writing validation info for {e} - {c} - {dn} to {db_path}/{e}_{c}_{dn}.yaml".format(
e=era, c=channel, dn=dataset_name, db_path=db_path
)
)
with open(
f"{db_path}/{era}_{channel}_{dataset_name}.yaml", "w"
) as outfi:
yaml.safe_dump(validation_dict, outfi, sort_keys=True, indent=4)
return Dataset(
dataset_name,
ntuples,
quantities_per_vars=validation_dict["quantities_per_vars"],
)
class Unit:
"""
Building element of a minimal analysis flow, consisting
of a dataset, a set of selections to apply on the data
and a set of actions.
Args:
dataset (Dataset): Set of TTree objects to run the
analysis on
selections (list): List of Selection-type objects
actions (Action): Actions to perform on the processed
dataset, can be 'Histogram' or 'Count'
        variation (Variation): Variation applied to the unit, meaning
            that its selections are the result of this variation being
            applied to another unit's selections
Attributes:
dataset (Dataset): Set of TTree objects to run the
analysis on
selections (list): List of Selection-type objects
actions (Action): Actions to perform on the processed
dataset, can be 'Histogram' or 'Count'
        variation (Variation): Variation applied to the unit, meaning
            that its selections are the result of this variation being
            applied to another unit's selections
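    Example:
        An illustrative sketch; ``dataset`` and ``selection`` are assumed to
        be existing Dataset and Selection objects, and the variable name and
        binning are arbitrary:
            hist = Histogram("m_vis", "m_vis", [0, 30, 60, 90, 120])
            unit = Unit(dataset, [selection], [hist])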
"""
def __init__(self, dataset, selections, actions, variation=None):
self.__set_dataset(dataset)
self.__set_selections(selections)
self.__set_actions(actions, variation)
def __str__(self):
layout = "\n".join(
[
"Dataset: {}".format(self.dataset.name),
"Selections: {}".format(self.selections),
"Actions: {}".format(self.actions),
]
)
return layout
def __set_dataset(self, dataset):
if not isinstance(dataset, Dataset):
raise TypeError("not a Dataset object.")
self.dataset = dataset
def __set_selections(self, selections):
if not isinstance(selections, list):
raise TypeError("not a list object.")
for selection in selections:
if not isinstance(selection, Selection):
raise TypeError("not a Selection object.")
self.selections = selections
def __set_actions(self, actions, variation):
if not isinstance(actions, list):
raise TypeError("not a list object.")
for action in actions:
if not isinstance(action, Action):
raise TypeError("not an Action object.")
self.actions = [self.__set_new_action(action, variation) for action in actions]
def __set_new_action(self, action, variation):
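        # For the nominal case the action is renamed following the scheme
        # "<dataset>#<selection1>-<selection2>-...#Nominal#<action name>";
        # for a varied unit, "Nominal" in the existing action name is
        # replaced by the variation name.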
if variation is None:
name = "#".join(
[
self.dataset.name,
"-".join([selection.name for selection in self.selections]),
"Nominal",
action.name,
]
)
else:
if not isinstance(variation, Variation):
raise TypeError("not a Variation object.")
self.variation = variation
name = action.name.replace("Nominal", self.variation.name)
if isinstance(action, Histogram):
return Histogram(name, action.variable, action.edges)
elif isinstance(action, Count):
return Count(name, action.variable)
def __eq__(self, other):
return (
self.dataset == other.dataset
and self.selections == other.selections
and self.actions == other.actions
)
def __hash__(self):
return hash((self.dataset, tuple(self.selections), tuple(self.actions)))
class UnitManager:
"""
Manager of all the Unit objects that are created.
    Unit objects are registered with the method 'book' and collected in
    the class-level list 'booked_units'; additional units can be booked
    at any time by calling 'book' again.
Attributes:
        booked_units (list): List of the booked units, updated by the
            method 'book'
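    Example:
        An illustrative sketch; ``unit`` and ``variation`` are assumed to be
        existing Unit and Variation objects:
            manager = UnitManager()
            manager.book([unit], variations=[variation])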
"""
booked_units = []
def book(self, units, variations=None, enable_check=True):
for unit in units:
if unit not in self.booked_units:
self.booked_units.append(unit)
if variations:
for variation in variations:
logger.debug("Applying variation {}".format(variation))
for unit in units:
self.apply_variation(unit, variation)
if enable_check:
for action1, action2 in itertools.combinations(
[j for i in [unit.actions for unit in self.booked_units] for j in i], 2
):
if action1.name == action2.name:
logger.fatal(
"Caught two actions with same name ({}, {})".format(
action1.name, action2.name
)
)
raise NameError
def apply_variation(self, unit, variation):
new_unit = variation.create(unit)
self.booked_units.append(new_unit)