-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_toucan_files.py
executable file
·103 lines (95 loc) · 3.97 KB
/
create_toucan_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env bash
# Bash/Python polyglot header. The shebang starts this file under bash, which
# runs the lines below: it activates the `cherokee-audio-data` conda
# environment and then re-executes this same file ("$0") with `python`,
# forwarding the original arguments ("$@"). To bash the next line is just the
# `true` command; to Python everything from the opening triple quote to the
# closing one is the module docstring, so the shell commands never run there.
"""true" '''\'
set -e
eval "$(conda shell.bash hook)"
conda deactivate
conda activate cherokee-audio-data
exec python "$0" "$@"
exit $?
''"""
import os
import shutil
from pydub import AudioSegment
from pydub.silence import split_on_silence
def _iter_rows(data_sets: list[str]):
    """Yield ``(folder, fields)`` for each usable row of every data set.

    Each folder is checked for a pipe-delimited ``all.txt``; folders without
    one are skipped. Rows with fewer than seven fields (blank lines included)
    are skipped too, because the caller indexes up to field 6.
    """
    for folder in data_sets:
        all_txt = os.path.join(folder, "all.txt")
        if not os.path.exists(all_txt):
            continue
        # Assumes the metadata is UTF-8; being explicit avoids depending on
        # the platform's default encoding for transcript text.
        with open(all_txt, encoding="utf-8") as r:
            for line in r:
                fields = line.strip().split("|")
                if len(fields) < 7:
                    continue
                yield folder, fields


def _export_ref_audio(wav: str, dest: str) -> None:
    """Write a mono copy of *wav* to *dest* with silent stretches removed."""
    audio = AudioSegment.from_file(wav).set_channels(1)
    chunks = split_on_silence(audio, min_silence_len=100, silence_thresh=-40, keep_silence=100)
    trimmed = AudioSegment.empty()
    for chunk in chunks:
        trimmed = trimmed.append(chunk, crossfade=0)
    trimmed.export(dest, format="wav")


def main() -> None:
    """Build the IMS-Toucan text lists and per-voice reference audio.

    Reads ``all.txt`` from each selected data set folder and produces, in the
    current working directory:

    * ``ims-toucan-<lang>.txt`` per language (field 2), one ``wav|text`` line
      per row;
    * ``ims-toucan-voices.txt`` with one ``voice|wav|text`` line per voice
      (field 1), using that voice's row whose text contains the most spaces;
    * ``_toucan_ref_audio/<voice>.wav``, a mono, silence-trimmed copy of that
      row's audio. The directory is deleted and recreated on every run.
    """
    ims_toucan_basename = "ims-toucan"
    data_sets: list[str] = ["comvoi_clean", "cstr-vctk-american"]
    ref_audio_dir = "_toucan_ref_audio"

    # One pass over the metadata collects everything both outputs need.
    lang_lines: dict[str, list[str]] = {}
    # voice -> (space count, wav path, stripped text) of the chosen reference row
    voice_refs: dict[str, tuple[int, str, str]] = {}
    for folder, fields in _iter_rows(data_sets):
        voice, lang = fields[1], fields[2]
        wav = os.path.join(folder, fields[3])
        text = fields[6]
        lang_lines.setdefault(lang, []).append(f"{wav}|{text}\n")

        ref_text = text.strip()
        # The number of spaces serves as a cheap word count.
        word_cnt = ref_text.count(" ")
        # ">=" lets a later row replace an earlier one on a tie.
        if voice not in voice_refs or word_cnt >= voice_refs[voice][0]:
            voice_refs[voice] = (word_cnt, wav, ref_text)

    shutil.rmtree(ref_audio_dir, ignore_errors=True)
    os.mkdir(ref_audio_dir)

    # create simple text file for use by IMS-Toucan, only selected data sets
    for lang, lines in lang_lines.items():
        with open(f"{ims_toucan_basename}-{lang}.txt", "w", encoding="utf-8") as w:
            w.writelines(lines)

    # create simple text file with REF audio
    # for use by IMS-Toucan, only selected data sets
    # This name is kept separate from the per-language names so the voices
    # list can never overwrite a language file.
    voices_file = f"{ims_toucan_basename}-voices.txt"
    with open(voices_file, "w", encoding="utf-8") as w:
        for voice, (_, wav, text) in voice_refs.items():
            w.write(f"{voice}|{wav}|{text}\n")
            _export_ref_audio(wav, os.path.join(ref_audio_dir, f"{voice}.wav"))
# Script entry point: run the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()