Skip to content

Commit

Permalink
update data preparation script
Browse files Browse the repository at this point in the history
  • Loading branch information
Poeroz committed Jul 2, 2024
1 parent f883655 commit 27de17f
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 0 deletions.
81 changes: 81 additions & 0 deletions ComSpeech/data_preparation/fill_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import pandas as pd
from examples.speech_to_text.data_utils import (
load_df_from_tsv,
save_df_to_tsv
)

# Index the "src" manifests of the fr/de/es -> en pairs. `database` maps each
# row's id (with any '.mp3' text removed) to that row's src_audio and
# src_n_frames values, so the later sections can look rows up by id.
database = {}
for lang in ("fr", "de", "es"):
    for split in ("train", "dev", "test"):
        src_manifest = load_df_from_tsv(f"data/cvss-c/{lang}-en/src/{split}.tsv")
        # Transposing first makes to_dict() yield one {column: value} dict
        # per manifest row.
        src_rows = src_manifest.T.to_dict().values()
        # A later row with the same id replaces the earlier entry.
        database.update({
            row['id'].replace('.mp3', ''): {
                'src_audio': row['src_audio'],
                'src_n_frames': row['src_n_frames'],
            }
            for row in src_rows
        })

# Merge the fields of every "tts" manifest into `database`, keyed by the same
# id convention (row id with any '.mp3' text removed). Ids that are already
# indexed gain the four fields below; ids seen here for the first time get a
# record holding only these fields.
for lang in (
    "ar", "ca", "cy", "de", "es", "et", "fa", "fr", "id", "it", "ja",
    "lv", "mn", "nl", "pt", "ru", "sl", "sv-SE", "ta", "tr", "zh-CN",
):
    for split in ("train", "dev", "test"):
        tts_manifest = load_df_from_tsv(f"data/cvss-c/{lang}-en/tts/{split}.tsv")
        for row in tts_manifest.T.to_dict().values():
            utt_id = row['id'].replace('.mp3', '')
            tgt_fields = {
                'tgt_audio': row['audio'],
                'tgt_n_frames': row['n_frames'],
                'pitch': row['pitch'],
                'energy': row['energy'],
            }
            # setdefault() supplies an empty record for ids not indexed yet,
            # so one update() call covers both the "known id" and "new id"
            # cases.
            database.setdefault(utt_id, {}).update(tgt_fields)

# (manifest column, database field) pairs used when rewriting the
# data/comspeech manifests. Tuples keep the assignment order explicit, which
# decides where a column that is not in the manifest yet gets appended.
_S2S_SRC_COLUMNS = (
    ('src_audio', 'src_audio'),
    ('src_n_frames', 'src_n_frames'),
)
_S2S_TGT_COLUMNS = (
    ('tgt_audio', 'tgt_audio'),
    ('tgt_n_frames', 'tgt_n_frames'),
    ('pitch', 'pitch'),
    ('energy', 'energy'),
)
_SRC_AS_AUDIO_COLUMNS = (
    ('audio', 'src_audio'),
    ('n_frames', 'src_n_frames'),
)
_TGT_AS_AUDIO_COLUMNS = (
    ('audio', 'tgt_audio'),
    ('n_frames', 'tgt_n_frames'),
    ('pitch', 'pitch'),
    ('energy', 'energy'),
)


def _fill_manifest(path, columns, tgt_columns=()):
    """Rewrite the TSV manifest at *path* with values taken from `database`.

    Each row is matched to a `database` record through its id with any
    '.mp3' text removed, and the result is saved back to the same *path*.

    Args:
        path: Manifest to load and then overwrite.
        columns: (manifest column, database field) pairs copied into
            every row.
        tgt_columns: Extra pairs copied only into rows that already carry
            a 'tgt_audio' column.

    Raises:
        KeyError: If a row's id, or one of the requested fields, is absent
            from `database`. The message names the manifest and the id.
    """
    rows = load_df_from_tsv(path).to_dict(orient="records")
    if not rows:
        # A frame rebuilt from zero rows has no columns; saving it would
        # drop the manifest's header, so leave the file as it is.
        return

    for row in rows:
        key = row['id'].replace('.mp3', '')
        wanted = columns + tgt_columns if 'tgt_audio' in row else columns
        try:
            record = database[key]
            for column, field in wanted:
                row[column] = record[field]
        except KeyError as err:
            raise KeyError(
                f"{path}: cannot fill id {key!r}: {err.args[0]!r} was not "
                f"found in the database built from the data/cvss-c manifests"
            ) from err

    save_df_to_tsv(pd.DataFrame(rows), path)


for lang in ["fr", "de", "es"]:
    for split in ["train", "dev", "test", "dev.full", "test.full"]:
        _fill_manifest(
            f"data/comspeech/cvss_{lang}_en/s2s/{split}.tsv",
            _S2S_SRC_COLUMNS,
            _S2S_TGT_COLUMNS,
        )

    # Still inside the language loop: this path interpolates `lang` as well.
    for split in ["train", "dev", "test"]:
        _fill_manifest(
            f"data/comspeech/cvss_{lang}_en/speech2unigram/{split}.tsv",
            _SRC_AS_AUDIO_COLUMNS,
        )

# The tts manifests live under a single cvss_x_en directory, so this loop
# does not depend on a language.
for split in ["train", "dev", "test"]:
    _fill_manifest(
        f"data/comspeech/cvss_x_en/tts/{split}.tsv",
        _TGT_AS_AUDIO_COLUMNS,
    )
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,21 @@ done

```bash
# For every language pair under data/cvss-c/: unpack mfa.tar.gz into
# mfa_align/, then run extract_tgt_features.py with that pair's directory as
# the manifest root and its tts/ subdirectory as the output root.
for src_lang in ar ca cy de es et fa fr id it ja lv mn nl pt ru sl sv-SE ta tr zh-CN; do
    mkdir -p "data/cvss-c/${src_lang}-en/mfa_align"
    tar -xzvf "data/cvss-c/${src_lang}-en/mfa.tar.gz" -C "data/cvss-c/${src_lang}-en/mfa_align/"
    python ComSpeech/data_preparation/extract_tgt_features.py \
        --audio-manifest-root "data/cvss-c/${src_lang}-en/" \
        --output-root "data/cvss-c/${src_lang}-en/tts" \
        --textgrid-dir "data/cvss-c/${src_lang}-en/mfa_align/speaker/"
done
```

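5. Replace the audio paths (together with the frame counts and the pitch/energy entries) in the manifest files under the `data/comspeech/` directory with the values from your local `data/cvss-c/` manifests.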

```bash
# Run from the directory that contains data/: the script uses relative paths
# (data/cvss-c/... as input, data/comspeech/... as output) and saves each
# data/comspeech manifest back to the path it was loaded from.
python ComSpeech/data_preparation/fill_data.py
```

### ComSpeech (Supervised Learning)

> [!Note]
Expand Down

0 comments on commit 27de17f

Please sign in to comment.