-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_audio_from_json.py
221 lines (169 loc) · 9.09 KB
/
create_audio_from_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python3
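# -*- coding: utf-8 -*-
"""
Create the audio files described by the 'dev' and 'eval' JSON metadata files.

For each mixture, the utterances of every speaker are convolved with a room
impulse response, scaled to the per-speaker SNR relative to the noise signal,
summed with the noise, and written as ``*_mix.wav``, ``*_speech.wav`` and
``*_noise.wav`` files.
"""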
import json
import numpy as np
import os
import soundfile as sf
from constants import SR_AUDIO, MAX_AMP
from paths import (udase_chime_5_audio_path, librispeech_path,
voicehome_path, reverberant_librichime_5_json_path, reverberant_librichime_5_audio_path)
from tqdm import tqdm
import scipy as sp
def compute_loudness(x):
    """Return the energy of the mean-removed signal ``x`` in decibels.

    The mean of ``x`` is subtracted, the squared samples are summed, and the
    sum is converted with ``10 * log10``. A signal whose energy is zero after
    mean removal yields ``-inf``.
    """
    centered = x - np.mean(x)
    energy = np.sum(centered ** 2)
    return 10 * np.log10(energy)
def create_reverberant_speech(mix_infos, dtype, voicehome_path, librispeech_path):
    """Build the reverberant speech signal of every speaker of a mixture.

    Each utterance is cut out of its source file, convolved with the
    speaker's room impulse response (RIR) and written at its position in
    the mixture.

    Parameters
    ----------
    mix_infos : dict
        Mixture metadata. Reads ``'length'`` (mixture length in samples) and
        every key containing ``'speaker_'``. Each speaker entry provides
        ``'utterances'`` (list of dicts with ``'file'``,
        ``'start_librispeech'``, ``'end_librispeech'``, ``'start_mix'`` and
        ``'end_mix'``) and ``'RIR'`` (dict with ``'file'`` and ``'channel'``).
    dtype : numpy dtype
        dtype of the returned array.
    voicehome_path : str
        Directory against which the RIR file names are resolved.
    librispeech_path : str
        Directory against which the utterance file names are resolved.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(mix_len, n_speakers)``; column ``i`` holds the
        reverberant speech of the ``i``-th speaker key found in
        ``mix_infos``.

    Raises
    ------
    AssertionError
        If an RIR or speech file is not sampled at ``SR_AUDIO``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not load
    # ``scipy.signal`` on older SciPy releases, so ``sp.signal`` may fail.
    from scipy.signal import fftconvolve

    mix_len = mix_infos['length']
    speakers = [key for key in mix_infos if 'speaker_' in key]
    speech_sigs = np.zeros((mix_len, len(speakers)), dtype=dtype)

    for spk_ind, spk in enumerate(speakers):
        # get speaker info
        spk_infos = mix_infos[spk]
        spk_utts = spk_infos['utterances']

        # get RIR info
        rir_infos = spk_infos['RIR']
        rir_file = rir_infos['file']
        rir_channel = rir_infos['channel']

        # read RIR file; the 2-D indexing below requires a multichannel file
        rir_path = os.path.join(voicehome_path, rir_file)
        rir_sig, sr = sf.read(rir_path)
        rir_sig = rir_sig[:, rir_channel]
        assert sr == SR_AUDIO
        rir_len = rir_sig.shape[0]

        # for each speaker's utterance
        for utt in spk_utts:
            # read utterance info
            utt_file = utt['file']
            start_librispeech = utt['start_librispeech']
            end_librispeech = utt['end_librispeech']
            start_mix = utt['start_mix']
            end_mix = utt['end_mix']
            utt_len = end_mix - start_mix

            # read speech file
            speech_path = os.path.join(librispeech_path, utt_file)
            speech_sig, sr = sf.read(speech_path)
            assert sr == SR_AUDIO

            # add reverberation; the convolution itself does not depend on
            # where the utterance sits in the mix, only the clipping does
            speech_sig_cut = speech_sig[start_librispeech:end_librispeech]
            rev_speech_sig = fftconvolve(speech_sig_cut, rir_sig, mode='full')

            if end_mix == mix_len:
                # utterance ends with the mix (it may also span the entire
                # mix): clip the end of the wet utterance so that it fits in
                # [start_mix, end_mix]
                speech_sigs[start_mix:end_mix, spk_ind] = rev_speech_sig[:utt_len]
            elif start_mix == 0:
                # utterance is at the beginning of the mix: clip the
                # beginning of the wet utterance so that the reverberant
                # tail is preserved and it fits in [start_mix, end_mix]
                speech_sigs[start_mix:end_mix, spk_ind] = rev_speech_sig[-utt_len:]
            else:
                # utterance in the middle of the mix: we do not clip and
                # allow the reverberant tail to extend beyond the initial
                # utterance length
                # NOTE(review): plain assignment overwrites samples already
                # written to this range for the same speaker (e.g. the tail
                # of a previous utterance) -- confirm this is intended.
                new_end_mix = end_mix + rir_len - 1
                if new_end_mix <= mix_len:
                    speech_sigs[start_mix:new_end_mix, spk_ind] = rev_speech_sig
                else:
                    # clip wet utterance if it extends beyond the mixture length
                    speech_sigs[start_mix:mix_len, spk_ind] = rev_speech_sig[:mix_len - start_mix]

    return speech_sigs
def main():
    """Generate the audio files of the 'dev' and 'eval' subsets.

    For each mixture listed in ``<subset>.json`` the function reads the noise
    file, builds the reverberant speech of every speaker, scales each speaker
    to its target SNR with respect to the noise, sums speech and noise, and
    writes ``<name>_mix.wav``, ``<name>_speech.wav`` and ``<name>_noise.wav``
    (16-bit PCM) into
    ``<reverberant_librichime_5_audio_path>/<subset>/<max_num_sim_active_speakers>/``.

    Raises
    ------
    AssertionError
        If the noise length or sampling rate does not match the metadata,
        if a loudness is infinite, or if the SNR obtained after scaling is
        not close to the requested one.
    """
    for subset in ['dev', 'eval']:
        # paths
        dataset_json_path = os.path.join(reverberant_librichime_5_json_path,
                                         subset + '.json')
        output_path = os.path.join(reverberant_librichime_5_audio_path, subset)

        # create output dir if necessary
        os.makedirs(output_path, exist_ok=True)

        # load metadata
        with open(dataset_json_path) as f:
            dataset = json.load(f)

        # create audio mixtures
        for mix_infos in tqdm(dataset, total=len(dataset)):
            # get mixture info
            mix_name = mix_infos['name']
            mix_len = mix_infos['length']
            mix_max_n_spk = mix_infos['max_num_sim_active_speakers']
            speakers = [key for key in mix_infos if 'speaker_' in key]

            # read noise file
            noise_file = mix_infos['noise']['filename']
            noise_path = os.path.join(udase_chime_5_audio_path, subset, '0',
                                      noise_file + '.wav')
            noise_sig, sr = sf.read(noise_path)
            if noise_sig.ndim == 2:
                # multichannel noise: keep the channel at index 1
                noise_sig = noise_sig[:, 1]
            assert noise_sig.shape[0] == mix_len
            assert sr == SR_AUDIO

            # compute noise loudness
            noise_loudness = compute_loudness(noise_sig)
            assert not np.isinf(noise_loudness)

            # create reverberant speech signals for all speakers
            speech_sigs = create_reverberant_speech(mix_infos, noise_sig.dtype,
                                                    voicehome_path,
                                                    librispeech_path)

            # mix reverberant speech signals
            speech_mix_sig = np.zeros(mix_len, dtype=noise_sig.dtype)
            for spk_ind, spk in enumerate(speakers):
                spk_infos = mix_infos[spk]
                speech_sig = speech_sigs[:, spk_ind]

                # compute speech loudness
                speech_loudness = compute_loudness(speech_sig)
                assert not np.isinf(speech_loudness)

                # compute original SNR
                orig_snr = speech_loudness - noise_loudness

                # get per-speaker snr
                snr_spk = spk_infos['SNR']

                # compute speech gain
                # we scale the speech signal and not the noise signal
                # to keep the original loudness of the CHiME data
                speech_gain = 10**((snr_spk - orig_snr)/20.0)

                # scale speech
                scaled_speech_sig = speech_sig * speech_gain

                # check new snr
                speech_loudness_new = compute_loudness(scaled_speech_sig)
                new_snr = speech_loudness_new - noise_loudness
                assert np.isclose(snr_spk, new_snr)

                # add scaled speech to mixture
                speech_mix_sig += scaled_speech_sig

            # mix speech and noise
            mix_sig = noise_sig + speech_mix_sig

            # handle clipping: if either the mixture or the speech exceeds
            # full scale, rescale all three signals by the same factor so
            # that mix = speech + noise still holds
            mix_peak = np.max(np.abs(mix_sig))
            speech_peak = np.max(np.abs(speech_mix_sig))
            if mix_peak > 1.0 or speech_peak > 1.0:
                scale_clipping = MAX_AMP/max(mix_peak, speech_peak)
                mix_sig = mix_sig*scale_clipping
                speech_mix_sig = speech_mix_sig*scale_clipping
                noise_sig = noise_sig*scale_clipping

            # save audio files
            mix_output_dir = os.path.join(output_path, str(mix_max_n_spk))
            os.makedirs(mix_output_dir, exist_ok=True)

            output_mix_file = os.path.join(mix_output_dir, mix_name + '_mix.wav')
            sf.write(output_mix_file, mix_sig, SR_AUDIO, 'PCM_16')

            output_speech_file = os.path.join(mix_output_dir, mix_name + '_speech.wav')
            sf.write(output_speech_file, speech_mix_sig, SR_AUDIO, 'PCM_16')

            output_noise_file = os.path.join(mix_output_dir, mix_name + '_noise.wav')
            sf.write(output_noise_file, noise_sig, SR_AUDIO, 'PCM_16')
# Script entry point: generate the audio files only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()