diff --git a/ComSpeech/data_preparation/fill_data.py b/ComSpeech/data_preparation/fill_data.py
new file mode 100644
index 0000000..9fe6688
--- /dev/null
+++ b/ComSpeech/data_preparation/fill_data.py
@@ -0,0 +1,121 @@
+"""Fill audio paths and features into the ComSpeech manifests.
+
+Reads the CVSS-C manifests under ``data/cvss-c/``, indexes their audio paths,
+frame counts, pitch and energy entries by sample id, and writes those values
+into the manifests under ``data/comspeech/`` in place.
+"""
+import pandas as pd
+from examples.speech_to_text.data_utils import (
+    load_df_from_tsv,
+    save_df_to_tsv
+)
+
+S2S_LANGS = ["fr", "de", "es"]
+TTS_LANGS = [
+    "ar", "ca", "cy", "de", "es", "et", "fa", "fr", "id", "it", "ja",
+    "lv", "mn", "nl", "pt", "ru", "sl", "sv-SE", "ta", "tr", "zh-CN",
+]
+SPLITS = ["train", "dev", "test"]
+
+
+def sample_key(item):
+    """Return the row's id with every '.mp3' occurrence removed."""
+    return item['id'].replace('.mp3', '')
+
+
+def load_rows(path):
+    """Load the TSV manifest at `path` as a list of per-row dicts."""
+    return load_df_from_tsv(path).to_dict(orient="records")
+
+
+def build_database():
+    """Index the CVSS-C source and TTS manifests by sample id."""
+    database = {}
+    for lang in S2S_LANGS:
+        for split in SPLITS:
+            for item in load_rows(f"data/cvss-c/{lang}-en/src/{split}.tsv"):
+                database[sample_key(item)] = {
+                    'src_audio': item['src_audio'],
+                    'src_n_frames': item['src_n_frames'],
+                }
+    for lang in TTS_LANGS:
+        for split in SPLITS:
+            for item in load_rows(f"data/cvss-c/{lang}-en/tts/{split}.tsv"):
+                # Extend the source entry when one exists, else start one.
+                database.setdefault(sample_key(item), {}).update({
+                    'tgt_audio': item['audio'],
+                    'tgt_n_frames': item['n_frames'],
+                    'pitch': item['pitch'],
+                    'energy': item['energy'],
+                })
+    return database
+
+
+def fill_s2s(item, entry):
+    """Fill source columns, plus target columns if the row has them."""
+    item['src_audio'] = entry['src_audio']
+    item['src_n_frames'] = entry['src_n_frames']
+    if 'tgt_audio' in item:
+        item['tgt_audio'] = entry['tgt_audio']
+        item['tgt_n_frames'] = entry['tgt_n_frames']
+        item['pitch'] = entry['pitch']
+        item['energy'] = entry['energy']
+
+
+def fill_speech2unigram(item, entry):
+    """Fill the audio columns from the source-side entry."""
+    item['audio'] = entry['src_audio']
+    item['n_frames'] = entry['src_n_frames']
+
+
+def fill_tts(item, entry):
+    """Fill the audio and feature columns from the target-side entry."""
+    item['audio'] = entry['tgt_audio']
+    item['n_frames'] = entry['tgt_n_frames']
+    item['pitch'] = entry['pitch']
+    item['energy'] = entry['energy']
+
+
+def fill_manifest(path, database, fill_row):
+    """Rewrite the manifest at `path` with values taken from `database`.
+
+    `fill_row(item, entry)` updates one row dict in place from its entry.
+    Raises KeyError naming the manifest and id when the id, or a field
+    that `fill_row` reads, is missing from `database`.
+    """
+    rows = load_rows(path)
+    if not rows:
+        # Nothing to fill; a frame rebuilt from no rows has no columns.
+        return
+    for item in rows:
+        key = sample_key(item)
+        try:
+            fill_row(item, database[key])
+        except KeyError as err:
+            raise KeyError(
+                f"{path}: missing CVSS-C data for id {key!r} ({err})"
+            ) from err
+    save_df_to_tsv(pd.DataFrame(rows), path)
+
+
+def main():
+    """Build the CVSS-C index and fill every ComSpeech manifest."""
+    database = build_database()
+    for lang in S2S_LANGS:
+        root = f"data/comspeech/cvss_{lang}_en"
+        for split in SPLITS + ["dev.full", "test.full"]:
+            fill_manifest(f"{root}/s2s/{split}.tsv", database, fill_s2s)
+        for split in SPLITS:
+            fill_manifest(
+                f"{root}/speech2unigram/{split}.tsv",
+                database,
+                fill_speech2unigram,
+            )
+    for split in SPLITS:
+        fill_manifest(
+            f"data/comspeech/cvss_x_en/tts/{split}.tsv", database, fill_tts
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/README.md b/README.md
index 3c0fb4d..dea0dd3 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,8 @@ done
 
 ```bash
 for src_lang in ar ca cy de es et fa fr id it ja lv mn nl pt ru sl sv-SE ta tr zh-CN; do
+    mkdir -p data/cvss-c/${src_lang}-en/mfa_align
+    tar -xzvf data/cvss-c/${src_lang}-en/mfa.tar.gz -C data/cvss-c/${src_lang}-en/mfa_align/
     python ComSpeech/data_preparation/extract_tgt_features.py \
         --audio-manifest-root data/cvss-c/${src_lang}-en/ \
         --output-root data/cvss-c/${src_lang}-en/tts \
@@ -90,6 +92,12 @@ for src_lang in ar ca cy de es et fa fr id it ja lv mn nl pt ru sl sv-SE ta tr z
 done
 ```
 
+5. Replace the path in files in the `data/comspeech/` directory.
+
+```bash
+python ComSpeech/data_preparation/fill_data.py
+```
+
 ### ComSpeech (Supervised Learning)
 
 > [!Note]