-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdclde_process_detections.m
173 lines (155 loc) · 6.5 KB
/
dclde_process_detections.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
function dclde_process_detections(base_dir)
% dclde_process_detections(base_dir)
% Custom Matlab script for converting silbido format detections
% to Tethys
%
% This is a one off script designed for the DCLDE 2022 data set,
% but can be adapted to convert detections in general.
%
% Note that we assume the deployment documents have already been
% added to the Tethys database. (These documents are queried.)
%
% base_dir is a string containing the root directory to be processed
% We expect the following subdirectories
% detections - contains silbido detections
% xml/detections - directory where XML files will be stored
% (created if it does not exist)
% If base_dir is not provided, the current working directory is used.
if nargin < 1
base_dir = '.';
end
% source and output directories
detections_dir = fullfile(base_dir, 'detections');
xml_dir = fullfile(base_dir, 'xml');
out_dir = fullfile(xml_dir, 'detections');
if ~ isdir(xml_dir)
mkdir(xml_dir);
end
if ~ isdir(out_dir)
mkdir(out_dir);
end
% Tethys server, modify host name as appropriate
q = dbInit('Server', 'localhost');
project = 'DCLDE2022';
% Get information about Tethys deployments
dep = dbGetDeployments(q, 'Project', project);
ids = string(arrayfun(@(x) x.Deployment.Id, dep));
starts = arrayfun(@(x) x.Deployment.DeploymentDetails.AudioTimeStamp, dep);
starts = datetime(vertcat(starts{:}), 'ConvertFrom', 'datenum');
stops = arrayfun(@(x) x.Deployment.RecoveryDetails.AudioTimeStamp, dep);
stops = datetime(vertcat(stops{:}), 'ConvertFrom', 'datenum');
cruise = string(arrayfun(@(x) x.Deployment.Cruise, dep));
deployments = table(ids', cruise', starts, stops, ...
'VariableNames', {'Id', 'cruise', 'start', 'stop'});
cruises = ["Lasker", "Sette"];
directories = ["1705", "1706"];
preparer = 'Peter Conant'; % person who prepared data
species = 180404; % odontocetes
for cruise_idx = 1:length(cruises)
path = fullfile(detections_dir, char(directories(cruise_idx)));
% Get detection files
[paths,names,~] = utFindFiles('*.det', {path}, 1);
[paths, timestamps] = getSortedNames(paths, names);
% DCLDE 2022 files are one minute apart.
% Find contiguous regions that are a single effort unit
delta = diff(timestamps);
maxgap = duration(0,1,59);
boundaries = [1; find(delta > maxgap)+1];
% toodo: cruise_idx 2 group_idx 12 fails java heap error about 500
% files in
for group_idx = 1:length(boundaries)
start = boundaries(group_idx);
if group_idx == length(boundaries)
stop = length(paths); % last group
else
stop = boundaries(group_idx+1) - 1;
end
% If there are very many detections, we can run into heap issues
% generating the XML. The right thing to do would be to split
% the group intelligently, paying attention to how many whistles
% are in each file. Here we simply split based on number of files
% without worrying about how many whistles are in each file
groupbyN = 120;
fprintf('group %d/%d %d files\n', ...
group_idx, length(boundaries), stop-start+1);
if (stop - start) > groupbyN
stop_intermediate = start+groupbyN-1;
while start <= stop
range = start:stop_intermediate;
process_detections(out_dir, project, cruises(cruise_idx), ...
paths(range), timestamps(range), ...
start, stop, deployments, preparer, species);
start = start+groupbyN;
stop_intermediate = min(stop, start+groupbyN-1);
end
else
range = start:stop;
process_detections(out_dir, project, cruises(cruise_idx), ...
paths(range), timestamps(range), ...
start, stop, deployments, preparer, species);
end
end
end
function [paths, timestamps] = getSortedNames(paths, names)
% [paths, timestamps] = getSortedNames
% Extract timestamps from file names and return filepaths
% sorted by time and their associated timestampd
% paths - full path to file
% names - filename only
% Pull timestamps from file names.
% DCLDE 2022 filesnames have pattern _YYYYMMDD_HHMMSS_FFF
match = regexp(names, '\d+_(?<timestamp>\d+_\d+_\d+)', 'names');
% Make sure we were able to find the timestamps in filenames
failures = cellfun(@isempty, match);
if any(failures)
error('Unable to extract timestamps for files\n%s\nEnd unable to extract timestamps', ...
strjoin(names(failures), '\n'));
end
% collect timestamp strings from cells and parse into datetimes
date_str = string(cellfun(@(x) x.timestamp, match, ...
'UniformOutput', false));
timestamps = datetime(date_str, 'InputFormat', 'yyyyMMdd_HHmmss_SSS');
% Sort by timestamp
if ~ issorted(timestamps)
[timestamps, permutation] = sort(timestamps);
% Reorder files to match sort order
paths = paths(permutation);
end
function process_detections(out_dir, project, cruise, paths, timestamps, start, stop, deployments, preparer, species)
% process_detections
% Generate a detection document.
% Each document has a unique id and filename based on the project,
% the name and the start time.
%
% out_dir - directory in which to store the detection document
% project - overall project identifier
% cruise - cruise or other identifying string
% paths - list of detection files
% timestamps - start time of each detection file
% start - effort start
% stop - effort end
% deployments - Information about deployments, we query this to find
% the correct deployment to associate with the detection effort
% preparer - Person who conducted the analysis
% species - species identifier (ITIS taxonomic serial number)
file_duration = seconds(60); % DCLDE 2022 files are this many s long
id = sprintf('%s_silbido_%s_%s', project, cruise, ...
datestr(timestamps(1), 'yyyy-mm-ddTHH-MM'));
% Identify the deployment with which these detections are associated
% We assume that any date within start and stop falls within a single
% deployment.
deployment = deployments( ...
deployments.cruise == cruise & ...
deployments.start <= timestamps(1) & ...
deployments.stop >= timestamps(1), :);
xml_file = fullfile(out_dir, sprintf('%s.xml', id));
start_eff = timestamps(1);
end_eff = timestamps(end)+file_duration;
fprintf('processing %d files into %s. Effort %s - %s\n', length(paths), ...
xml_file, start_eff, end_eff);
showonly = false;
if ~ showonly
dclde_detections2xml(xml_file, id, ...
paths, timestamps, start_eff, end_eff, ...
deployment.Id, preparer, species);
end