From 902ff61976bbecffede41aebcbbd0a993d271fe5 Mon Sep 17 00:00:00 2001 From: dtx525942103 <525942103@qq.com> Date: Fri, 16 Sep 2022 10:12:29 +0000 Subject: [PATCH] release --- README.md | 80 +++ config/__init__.py | 0 config/shared_configs.py | 342 +++++++++ saved_models/config.json | 104 +++ speaker_encoder/README.md | 18 + speaker_encoder/__init__.py | 0 speaker_encoder/configs/config.json | 104 +++ speaker_encoder/dataset.py | 253 +++++++ speaker_encoder/losses.py | 220 ++++++ speaker_encoder/models/lstm.py | 131 ++++ speaker_encoder/models/resnet.py | 212 ++++++ speaker_encoder/requirements.txt | 2 + speaker_encoder/speaker_encoder_config.py | 65 ++ speaker_encoder/umap.png | Bin 0 -> 24616 bytes speaker_encoder/utils/__init__.py | 0 speaker_encoder/utils/generic_utils.py | 220 ++++++ speaker_encoder/utils/io.py | 38 + speaker_encoder/utils/prepare_voxceleb.py | 219 ++++++ speaker_encoder/utils/visual.py | 46 ++ utils/__init__.py | 0 utils/audio.py | 822 ++++++++++++++++++++++ utils/io.py | 198 ++++++ vi_speaker_batch.py | 88 +++ vi_speaker_center.py | 21 + vi_speaker_single.py | 109 +++ 25 files changed, 3292 insertions(+) create mode 100644 config/__init__.py create mode 100644 config/shared_configs.py create mode 100644 saved_models/config.json create mode 100644 speaker_encoder/README.md create mode 100644 speaker_encoder/__init__.py create mode 100644 speaker_encoder/configs/config.json create mode 100644 speaker_encoder/dataset.py create mode 100644 speaker_encoder/losses.py create mode 100644 speaker_encoder/models/lstm.py create mode 100644 speaker_encoder/models/resnet.py create mode 100644 speaker_encoder/requirements.txt create mode 100644 speaker_encoder/speaker_encoder_config.py create mode 100644 speaker_encoder/umap.png create mode 100644 speaker_encoder/utils/__init__.py create mode 100644 speaker_encoder/utils/generic_utils.py create mode 100644 speaker_encoder/utils/io.py create mode 100644 speaker_encoder/utils/prepare_voxceleb.py create mode 100644 speaker_encoder/utils/visual.py create mode 100644 utils/__init__.py create mode 100644 utils/audio.py create mode 100644 utils/io.py create mode 100644 vi_speaker_batch.py create mode 100644 vi_speaker_center.py create mode 100644 vi_speaker_single.py diff --git a/README.md b/README.md index 94f2dff..3d508bc 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,82 @@ # VI-Speaker Speaker embedding for VI-SVC and VI-SVS, alse for VITS; Use this to replace the ID to implement voice clone. 
+ +# code from mozill_tts and Coqpit/TTS +https://github.com/mozilla/TTS/tree/master/TTS/speaker_encoder + +pip install coqpit + +# download model,or get it at **release** +https://github.com/mozilla/TTS/wiki/Released-Models + +Speaker-Encoder by @mueller91 LibriTTS + VCTK + VoxCeleb + CommonVoice + +# please read the config +https://drive.google.com/drive/folders/15oeBYf6Qn1edONkVLXe82MzdIi3O_9m3 + +# use +python vi_speaker_single.py ./saved_models/best_model.pth.tar ./saved_models/config.json -s TEST.wav -t TEST.npy + +# batch use +python vi_speaker_batch.py ./saved_models/best_model.pth.tar ./saved_models/config.json ./data/waves ./speaker_embedding + +data/ +└── waves + ├── spk1 + │   ├── 000002.wav + │   ├── 000006.wav + │   └── 000038.wav + └── spk2 + ├── 000040.wav + ├── 000044.wav + └── 000077.wav + +speaker_embedding/ +├── spk1 +│   ├── 000002.npy +│   ├── 000006.npy +│   └── 000038.npy +└── spk2 + ├── 000040.npy + ├── 000044.npy + └── 000077.npy + +# compute speaker center +input path = speaker_embedding, output path = speaker_embedding_center + +python vi_speaker_center.py + +speaker_embedding_center/ +├── spk1.npy +└── spk2.npy + + +# for VI-SVC +mv speaker_embedding_center data/spkid + +data/ +├── waves +│   ├── 10001 +│   ├── 20400 +│   │   ├── 20400_001.wav +│   │   ├── 20456_019.wav +│   │   +├── phone +│   ├── 10001 +│   ├── 20400 +│   │   ├── 20400_001.npy +│   │   ├── 20456_019.npy +│   │   +├── lable +│   ├── 10001 +│   ├── 20400 +│   │   ├── 20400_001.npy +│   │   ├── 20456_019.npy +│   │   +├── spkid +│   ├── 10001.npy +│   ├── 20400.npy +│   │   + + + diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/shared_configs.py b/config/shared_configs.py new file mode 100644 index 0000000..d91bf2b --- /dev/null +++ b/config/shared_configs.py @@ -0,0 +1,342 @@ +from dataclasses import asdict, dataclass +from typing import List + +from coqpit import Coqpit, check_argument + + +@dataclass +class BaseAudioConfig(Coqpit): + """Base config to definge audio processing parameters. It is used to initialize + ```TTS.utils.audio.AudioProcessor.``` + + Args: + fft_size (int): + Number of STFT frequency levels aka.size of the linear spectogram frame. Defaults to 1024. + + win_length (int): + Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match + ```fft_size```. Defaults to 1024. + + hop_length (int): + Number of audio samples between adjacent STFT columns. Defaults to 1024. + + frame_shift_ms (int): + Set ```hop_length``` based on milliseconds and sampling rate. + + frame_length_ms (int): + Set ```win_length``` based on milliseconds and sampling rate. + + stft_pad_mode (str): + Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'. + + sample_rate (int): + Audio sampling rate. Defaults to 22050. + + resample (bool): + Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```. + + preemphasis (float): + Preemphasis coefficient. Defaults to 0.0. + + ref_level_db (int): 20 + Reference Db level to rebase the audio signal and ignore the level below. 20Db is assumed the sound of air. + Defaults to 20. + + do_sound_norm (bool): + Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False. + + log_func (str): + Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'. 
+ + do_trim_silence (bool): + Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + + do_amp_to_db_mel (bool, optional): + enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + + trim_db (int): + Silence threshold used for silence trimming. Defaults to 45. + + power (float): + Exponent used for expanding spectrogra levels before running Griffin Lim. It helps to reduce the + artifacts in the synthesized voice. Defaults to 1.5. + + griffin_lim_iters (int): + Number of Griffing Lim iterations. Defaults to 60. + + num_mels (int): + Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80. + + mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices. + It needs to be adjusted for a dataset. Defaults to 0. + + mel_fmax (float): + Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset. + + spec_gain (int): + Gain applied when converting amplitude to DB. Defaults to 20. + + signal_norm (bool): + enable/disable signal normalization. Defaults to True. + + min_level_db (int): + minimum db threshold for the computed melspectrograms. Defaults to -100. + + symmetric_norm (bool): + enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else + [0, k], Defaults to True. + + max_norm (float): + ```k``` defining the normalization range. Defaults to 4.0. + + clip_norm (bool): + enable/disable clipping the our of range values in the normalized audio signal. Defaults to True. + + stats_path (str): + Path to the computed stats file. Defaults to None. 
+ """ + + # stft parameters + fft_size: int = 1024 + win_length: int = 1024 + hop_length: int = 256 + frame_shift_ms: int = None + frame_length_ms: int = None + stft_pad_mode: str = "reflect" + # audio processing parameters + sample_rate: int = 22050 + resample: bool = False + preemphasis: float = 0.0 + ref_level_db: int = 20 + do_sound_norm: bool = False + log_func: str = "np.log10" + # silence trimming + do_trim_silence: bool = True + trim_db: int = 45 + # griffin-lim params + power: float = 1.5 + griffin_lim_iters: int = 60 + # mel-spec params + num_mels: int = 80 + mel_fmin: float = 0.0 + mel_fmax: float = None + spec_gain: int = 20 + do_amp_to_db_linear: bool = True + do_amp_to_db_mel: bool = True + # normalization params + signal_norm: bool = True + min_level_db: int = -100 + symmetric_norm: bool = True + max_norm: float = 4.0 + clip_norm: bool = True + stats_path: str = None + + def check_values( + self, + ): + """Check config fields""" + c = asdict(self) + check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056) + check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058) + check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000) + check_argument( + "frame_length_ms", + c, + restricted=True, + min_val=10, + max_val=1000, + alternative="win_length", + ) + check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length") + check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1) + check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10) + check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000) + check_argument("power", c, restricted=True, min_val=1, max_val=5) + check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000) + + # normalization parameters + check_argument("signal_norm", c, restricted=True) + check_argument("symmetric_norm", c, restricted=True) + check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000) + check_argument("clip_norm", c, restricted=True) + check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000) + check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True) + check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100) + check_argument("do_trim_silence", c, restricted=True) + check_argument("trim_db", c, restricted=True) + + +@dataclass +class BaseDatasetConfig(Coqpit): + """Base config for TTS datasets. + + Args: + name (str): + Dataset name that defines the preprocessor in use. Defaults to None. + + path (str): + Root path to the dataset files. Defaults to None. + + meta_file_train (str): + Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets. + Defaults to None. + + unused_speakers (List): + List of speakers IDs that are not used at the training. Default None. + + meta_file_val (str): + Name of the dataset meta file that defines the instances used at validation. + + meta_file_attn_mask (str): + Path to the file that lists the attention mask files used with models that require attention masks to + train the duration predictor. 
+ """ + + name: str = "" + path: str = "" + meta_file_train: str = "" + ununsed_speakers: List[str] = None + meta_file_val: str = "" + meta_file_attn_mask: str = "" + + def check_values( + self, + ): + """Check config fields""" + c = asdict(self) + check_argument("name", c, restricted=True) + check_argument("path", c, restricted=True) + check_argument("meta_file_train", c, restricted=True) + check_argument("meta_file_val", c, restricted=False) + check_argument("meta_file_attn_mask", c, restricted=False) + + +@dataclass +class BaseTrainingConfig(Coqpit): + """Base config to define the basic training parameters that are shared + among all the models. + + Args: + model (str): + Name of the model that is used in the training. + + run_name (str): + Name of the experiment. This prefixes the output folder name. Defaults to `coqui_tts`. + + run_description (str): + Short description of the experiment. + + epochs (int): + Number training epochs. Defaults to 10000. + + batch_size (int): + Training batch size. + + eval_batch_size (int): + Validation batch size. + + mixed_precision (bool): + Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however + it may also cause numerical unstability in some cases. + + scheduler_after_epoch (bool): + If true, run the scheduler step after each epoch else run it after each model step. + + run_eval (bool): + Enable / Disable evaluation (validation) run. Defaults to True. + + test_delay_epochs (int): + Number of epochs before starting to use evaluation runs. Initially, models do not generate meaningful + results, hence waiting for a couple of epochs might save some time. + + print_eval (bool): + Enable / Disable console logging for evalutaion steps. If disabled then it only shows the final values at + the end of the evaluation. Default to ```False```. + + print_step (int): + Number of steps required to print the next training log. + + log_dashboard (str): "tensorboard" or "wandb" + Set the experiment tracking tool + + plot_step (int): + Number of steps required to log training on Tensorboard. + + model_param_stats (bool): + Enable / Disable logging internal model stats for model diagnostic. It might be useful for model debugging. + Defaults to ```False```. + + project_name (str): + Name of the project. Defaults to config.model + + wandb_entity (str): + Name of W&B entity/team. Enables collaboration across a team or org. + + log_model_step (int): + Number of steps required to log a checkpoint as W&B artifact + + save_step (int):ipt + Number of steps required to save the next checkpoint. + + checkpoint (bool): + Enable / Disable checkpointing. + + keep_all_best (bool): + Enable / Disable keeping all the saved best models instead of overwriting the previous one. Defaults + to ```False```. + + keep_after (int): + Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults + to 10000. + + num_loader_workers (int): + Number of workers for training time dataloader. + + num_eval_loader_workers (int): + Number of workers for evaluation time dataloader. + + output_path (str): + Path for training output folder, either a local file path or other + URLs supported by both fsspec and tensorboardX, e.g. GCS (gs://) or + S3 (s3://) paths. The nonexist part of the given path is created + automatically. All training artefacts are saved there. 
+ """ + + model: str = None + run_name: str = "coqui_tts" + run_description: str = "" + # training params + epochs: int = 10000 + batch_size: int = None + eval_batch_size: int = None + mixed_precision: bool = False + scheduler_after_epoch: bool = False + # eval params + run_eval: bool = True + test_delay_epochs: int = 0 + print_eval: bool = False + # logging + dashboard_logger: str = "tensorboard" + print_step: int = 25 + plot_step: int = 100 + model_param_stats: bool = False + project_name: str = None + log_model_step: int = None + wandb_entity: str = None + # checkpointing + save_step: int = 10000 + checkpoint: bool = True + keep_all_best: bool = False + keep_after: int = 10000 + # dataloading + num_loader_workers: int = 0 + num_eval_loader_workers: int = 0 + use_noise_augment: bool = False + # paths + output_path: str = None + # distributed + distributed_backend: str = "nccl" + distributed_url: str = "tcp://localhost:54321" diff --git a/saved_models/config.json b/saved_models/config.json new file mode 100644 index 0000000..e330aab --- /dev/null +++ b/saved_models/config.json @@ -0,0 +1,104 @@ +{ + "model_name": "lstm", + "run_name": "mueller91", + "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + }, + "reinit_layers": [], + "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. 
+ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "voice_len": 2.0, // size of the voice + "num_utters_per_speaker": 10, // + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 20, // Number of steps to log traning on console. + "output_path": "../../OutputsMozilla/checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "model": { + "input_dim": 80, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": true + }, + "storage": { + "sample_from_storage_p": 0.9, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 25, // the size of the in-memory storage with respect to a single batch + "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + }, + "datasets": + [ + { + "name": "vctk_slim", + "path": "../../../audio-datasets/en/VCTK-Corpus/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-other-500", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "../../../audio-datasets/en/voxceleb1/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb2", + "path": "../../../audio-datasets/en/voxceleb2/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "common_voice", + "path": "../../../audio-datasets/en/MozillaCommonVoice", + "meta_file_train": "train.tsv", + "meta_file_val": "test.tsv" + } + ] +} \ No newline at end of file diff --git a/speaker_encoder/README.md b/speaker_encoder/README.md new file mode 100644 index 0000000..b6f541f --- /dev/null +++ b/speaker_encoder/README.md @@ -0,0 +1,18 @@ +### Speaker Encoder + +This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. + +With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. + +Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). + +![](umap.png) + +Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. + +To run the code, you need to follow the same flow as in TTS. + +- Define 'config.json' for your needs. 
Note that, audio parameters should match your TTS model. +- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` +- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. +- Watch training on Tensorboard as in TTS diff --git a/speaker_encoder/__init__.py b/speaker_encoder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/speaker_encoder/configs/config.json b/speaker_encoder/configs/config.json new file mode 100644 index 0000000..e330aab --- /dev/null +++ b/speaker_encoder/configs/config.json @@ -0,0 +1,104 @@ +{ + "model_name": "lstm", + "run_name": "mueller91", + "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + }, + "reinit_layers": [], + "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. 
+ "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "voice_len": 2.0, // size of the voice + "num_utters_per_speaker": 10, // + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 20, // Number of steps to log traning on console. + "output_path": "../../OutputsMozilla/checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "model": { + "input_dim": 80, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": true + }, + "storage": { + "sample_from_storage_p": 0.9, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 25, // the size of the in-memory storage with respect to a single batch + "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + }, + "datasets": + [ + { + "name": "vctk_slim", + "path": "../../../audio-datasets/en/VCTK-Corpus/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-other-500", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "../../../audio-datasets/en/voxceleb1/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb2", + "path": "../../../audio-datasets/en/voxceleb2/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "common_voice", + "path": "../../../audio-datasets/en/MozillaCommonVoice", + "meta_file_train": "train.tsv", + "meta_file_val": "test.tsv" + } + ] +} \ No newline at end of file diff --git a/speaker_encoder/dataset.py b/speaker_encoder/dataset.py new file mode 100644 index 0000000..6b2b0dd --- /dev/null +++ b/speaker_encoder/dataset.py @@ -0,0 +1,253 @@ +import random + +import numpy as np +import torch +from torch.utils.data import Dataset + +from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage + + +class SpeakerEncoderDataset(Dataset): + def __init__( + self, + ap, + meta_data, + voice_len=1.6, + num_speakers_in_batch=64, + storage_size=1, + sample_from_storage_p=0.5, + num_utter_per_speaker=10, + skip_speakers=False, + verbose=False, + augmentation_config=None, + ): + """ + Args: + ap (TTS.tts.utils.AudioProcessor): audio processor object. + meta_data (list): list of dataset instances. + seq_len (int): voice segment length in seconds. + verbose (bool): print diagnostic information. 
+ """ + super().__init__() + self.items = meta_data + self.sample_rate = ap.sample_rate + self.seq_len = int(voice_len * self.sample_rate) + self.num_speakers_in_batch = num_speakers_in_batch + self.num_utter_per_speaker = num_utter_per_speaker + self.skip_speakers = skip_speakers + self.ap = ap + self.verbose = verbose + self.__parse_items() + storage_max_size = storage_size * num_speakers_in_batch + self.storage = Storage( + maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch + ) + self.sample_from_storage_p = float(sample_from_storage_p) + + speakers_aux = list(self.speakers) + speakers_aux.sort() + self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)} + + # Augmentation + self.augmentator = None + self.gaussian_augmentation_config = None + if augmentation_config: + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): + self.augmentator = AugmentWAV(ap, augmentation_config) + + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] + + if self.verbose: + print("\n > DataLoader initialization") + print(f" | > Speakers per Batch: {num_speakers_in_batch}") + print(f" | > Storage Size: {storage_max_size} instances, each with {num_utter_per_speaker} utters") + print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") + print(f" | > Number of instances : {len(self.items)}") + print(f" | > Sequence length: {self.seq_len}") + print(f" | > Num speakers: {len(self.speakers)}") + + def load_wav(self, filename): + audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) + return audio + + def load_data(self, idx): + text, wav_file, speaker_name = self.items[idx] + wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) + mel = self.ap.melspectrogram(wav).astype("float32") + # sample seq_len + + assert text.size > 0, self.items[idx][1] + assert wav.size > 0, self.items[idx][1] + + sample = { + "mel": mel, + "item_idx": self.items[idx][1], + "speaker_name": speaker_name, + } + return sample + + def __parse_items(self): + self.speaker_to_utters = {} + for i in self.items: + path_ = i[1] + speaker_ = i[2] + if speaker_ in self.speaker_to_utters.keys(): + self.speaker_to_utters[speaker_].append(path_) + else: + self.speaker_to_utters[speaker_] = [ + path_, + ] + + if self.skip_speakers: + self.speaker_to_utters = { + k: v for (k, v) in self.speaker_to_utters.items() if len(v) >= self.num_utter_per_speaker + } + + self.speakers = [k for (k, v) in self.speaker_to_utters.items()] + + def __len__(self): + return int(1e10) + + def get_num_speakers(self): + return len(self.speakers) + + def __sample_speaker(self, ignore_speakers=None): + speaker = random.sample(self.speakers, 1)[0] + # if list of speakers_id is provide make sure that it's will be ignored + if ignore_speakers and self.speakerid_to_classid[speaker] in ignore_speakers: + while True: + speaker = random.sample(self.speakers, 1)[0] + if self.speakerid_to_classid[speaker] not in ignore_speakers: + break + + if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]): + utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker) + else: + utters = random.sample(self.speaker_to_utters[speaker], self.num_utter_per_speaker) + return speaker, utters + + def __sample_speaker_utterances(self, speaker): + """ + Sample all M utterances for the given speaker. 
+ """ + wavs = [] + labels = [] + for _ in range(self.num_utter_per_speaker): + # TODO:dummy but works + while True: + # remove speakers that have num_utter less than 2 + if len(self.speaker_to_utters[speaker]) > 1: + utter = random.sample(self.speaker_to_utters[speaker], 1)[0] + else: + if speaker in self.speakers: + self.speakers.remove(speaker) + + speaker, _ = self.__sample_speaker() + continue + + wav = self.load_wav(utter) + if wav.shape[0] - self.seq_len > 0: + break + + if utter in self.speaker_to_utters[speaker]: + self.speaker_to_utters[speaker].remove(utter) + + if self.augmentator is not None and self.data_augmentation_p: + if random.random() < self.data_augmentation_p: + wav = self.augmentator.apply_one(wav) + + wavs.append(wav) + labels.append(self.speakerid_to_classid[speaker]) + return wavs, labels + + def __getitem__(self, idx): + speaker, _ = self.__sample_speaker() + speaker_id = self.speakerid_to_classid[speaker] + return speaker, speaker_id + + def __load_from_disk_and_storage(self, speaker): + # don't sample from storage, but from HDD + wavs_, labels_ = self.__sample_speaker_utterances(speaker) + # put the newly loaded item into storage + self.storage.append((wavs_, labels_)) + return wavs_, labels_ + + def collate_fn(self, batch): + # get the batch speaker_ids + batch = np.array(batch) + speakers_id_in_batch = set(batch[:, 1].astype(np.int32)) + + labels = [] + feats = [] + speakers = set() + + for speaker, speaker_id in batch: + speaker_id = int(speaker_id) + + # ensure that an speaker appears only once in the batch + if speaker_id in speakers: + + # remove current speaker + if speaker_id in speakers_id_in_batch: + speakers_id_in_batch.remove(speaker_id) + + speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch) + speaker_id = self.speakerid_to_classid[speaker] + speakers_id_in_batch.add(speaker_id) + + if random.random() < self.sample_from_storage_p and self.storage.full(): + # sample from storage (if full) + wavs_, labels_ = self.storage.get_random_sample_fast() + + # force choose the current speaker or other not in batch + # It's necessary for ideal training with AngleProto and GE2E losses + if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id: + attempts = 0 + while True: + wavs_, labels_ = self.storage.get_random_sample_fast() + if labels_[0] == speaker_id or labels_[0] not in speakers_id_in_batch: + break + + attempts += 1 + # Try 5 times after that load from disk + if attempts >= 5: + wavs_, labels_ = self.__load_from_disk_and_storage(speaker) + break + else: + # don't sample from storage, but from HDD + wavs_, labels_ = self.__load_from_disk_and_storage(speaker) + + # append speaker for control + speakers.add(labels_[0]) + + # remove current speaker and append other + if speaker_id in speakers_id_in_batch: + speakers_id_in_batch.remove(speaker_id) + + speakers_id_in_batch.add(labels_[0]) + + # get a random subset of each of the wavs and extract mel spectrograms. 
+ feats_ = [] + for wav in wavs_: + offset = random.randint(0, wav.shape[0] - self.seq_len) + wav = wav[offset : offset + self.seq_len] + # add random gaussian noise + if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]: + if random.random() < self.gaussian_augmentation_config["p"]: + wav += np.random.normal( + self.gaussian_augmentation_config["min_amplitude"], + self.gaussian_augmentation_config["max_amplitude"], + size=len(wav), + ) + mel = self.ap.melspectrogram(wav) + feats_.append(torch.FloatTensor(mel)) + + labels.append(torch.LongTensor(labels_)) + feats.extend(feats_) + + feats = torch.stack(feats) + labels = torch.stack(labels) + + return feats.transpose(1, 2), labels diff --git a/speaker_encoder/losses.py b/speaker_encoder/losses.py new file mode 100644 index 0000000..8ba917b --- /dev/null +++ b/speaker_encoder/losses.py @@ -0,0 +1,220 @@ +import torch +import torch.nn.functional as F +from torch import nn + + +# adapted from https://github.com/cvqluu/GE2E-Loss +class GE2ELoss(nn.Module): + def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"): + """ + Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1] + Accepts an input of size (N, M, D) + where N is the number of speakers in the batch, + M is the number of utterances per speaker, + and D is the dimensionality of the embedding vector (e.g. d-vector) + Args: + - init_w (float): defines the initial value of w in Equation (5) of [1] + - init_b (float): definies the initial value of b in Equation (5) of [1] + """ + super().__init__() + # pylint: disable=E1102 + self.w = nn.Parameter(torch.tensor(init_w)) + # pylint: disable=E1102 + self.b = nn.Parameter(torch.tensor(init_b)) + self.loss_method = loss_method + + print(" > Initialized Generalized End-to-End loss") + + assert self.loss_method in ["softmax", "contrast"] + + if self.loss_method == "softmax": + self.embed_loss = self.embed_loss_softmax + if self.loss_method == "contrast": + self.embed_loss = self.embed_loss_contrast + + # pylint: disable=R0201 + def calc_new_centroids(self, dvecs, centroids, spkr, utt): + """ + Calculates the new centroids excluding the reference utterance + """ + excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :])) + excl = torch.mean(excl, 0) + new_centroids = [] + for i, centroid in enumerate(centroids): + if i == spkr: + new_centroids.append(excl) + else: + new_centroids.append(centroid) + return torch.stack(new_centroids) + + def calc_cosine_sim(self, dvecs, centroids): + """ + Make the cosine similarity matrix with dims (N,M,N) + """ + cos_sim_matrix = [] + for spkr_idx, speaker in enumerate(dvecs): + cs_row = [] + for utt_idx, utterance in enumerate(speaker): + new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx) + # vector based cosine similarity for speed + cs_row.append( + torch.clamp( + torch.mm( + utterance.unsqueeze(1).transpose(0, 1), + new_centroids.transpose(0, 1), + ) + / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), + 1e-6, + ) + ) + cs_row = torch.cat(cs_row, dim=0) + cos_sim_matrix.append(cs_row) + return torch.stack(cos_sim_matrix) + + # pylint: disable=R0201 + def embed_loss_softmax(self, dvecs, cos_sim_matrix): + """ + Calculates the loss on each embedding $L(e_{ji})$ by taking softmax + """ + N, M, _ = dvecs.shape + L = [] + for j in range(N): + L_row = [] + for i in range(M): + L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j]) + L_row = torch.stack(L_row) + L.append(L_row) + return 
torch.stack(L) + + # pylint: disable=R0201 + def embed_loss_contrast(self, dvecs, cos_sim_matrix): + """ + Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid + """ + N, M, _ = dvecs.shape + L = [] + for j in range(N): + L_row = [] + for i in range(M): + centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i]) + excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :])) + L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids)) + L_row = torch.stack(L_row) + L.append(L_row) + return torch.stack(L) + + def forward(self, x, _label=None): + """ + Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + + assert x.size()[1] >= 2 + + centroids = torch.mean(x, 1) + cos_sim_matrix = self.calc_cosine_sim(x, centroids) + torch.clamp(self.w, 1e-6) + cos_sim_matrix = self.w * cos_sim_matrix + self.b + L = self.embed_loss(x, cos_sim_matrix) + return L.mean() + + +# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py +class AngleProtoLoss(nn.Module): + """ + Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982 + Accepts an input of size (N, M, D) + where N is the number of speakers in the batch, + M is the number of utterances per speaker, + and D is the dimensionality of the embedding vector + Args: + - init_w (float): defines the initial value of w + - init_b (float): definies the initial value of b + """ + + def __init__(self, init_w=10.0, init_b=-5.0): + super().__init__() + # pylint: disable=E1102 + self.w = nn.Parameter(torch.tensor(init_w)) + # pylint: disable=E1102 + self.b = nn.Parameter(torch.tensor(init_b)) + self.criterion = torch.nn.CrossEntropyLoss() + + print(" > Initialized Angular Prototypical loss") + + def forward(self, x, _label=None): + """ + Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + + assert x.size()[1] >= 2 + + out_anchor = torch.mean(x[:, 1:, :], 1) + out_positive = x[:, 0, :] + num_speakers = out_anchor.size()[0] + + cos_sim_matrix = F.cosine_similarity( + out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), + out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2), + ) + torch.clamp(self.w, 1e-6) + cos_sim_matrix = cos_sim_matrix * self.w + self.b + label = torch.arange(num_speakers).to(cos_sim_matrix.device) + L = self.criterion(cos_sim_matrix, label) + return L + + +class SoftmaxLoss(nn.Module): + """ + Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982 + Args: + - embedding_dim (float): speaker embedding dim + - n_speakers (float): number of speakers + """ + + def __init__(self, embedding_dim, n_speakers): + super().__init__() + + self.criterion = torch.nn.CrossEntropyLoss() + self.fc = nn.Linear(embedding_dim, n_speakers) + + print("Initialised Softmax Loss") + + def forward(self, x, label=None): + # reshape for compatibility + x = x.reshape(-1, x.size()[-1]) + label = label.reshape(-1) + + x = self.fc(x) + L = self.criterion(x, label) + + return L + + +class SoftmaxAngleProtoLoss(nn.Module): + """ + Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 + Args: + - embedding_dim (float): speaker embedding dim + - n_speakers (float): number of speakers + - init_w (float): defines the initial value of w + - init_b (float): definies the initial value of b + """ + + def 
__init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): + super().__init__() + + self.softmax = SoftmaxLoss(embedding_dim, n_speakers) + self.angleproto = AngleProtoLoss(init_w, init_b) + + print("Initialised SoftmaxAnglePrototypical Loss") + + def forward(self, x, label=None): + """ + Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + + Lp = self.angleproto(x) + + Ls = self.softmax(x, label) + + return Ls + Lp diff --git a/speaker_encoder/models/lstm.py b/speaker_encoder/models/lstm.py new file mode 100644 index 0000000..7430e72 --- /dev/null +++ b/speaker_encoder/models/lstm.py @@ -0,0 +1,131 @@ +import numpy as np +import torch +from torch import nn + +from utils.io import load_fsspec + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + + +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + + +class LSTMSpeakerEncoder(nn.Module): + def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x): + # TODO: implement state passing for lstms + d = self.layers(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + @torch.no_grad() + def inference(self, x): + d = self.layers.forward(x) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d + + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.inference(frames_batch) + + if 
return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + + return embeddings + + def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): + """ + Generate embeddings for a batch of utterances + x: BxTxD + """ + num_overlap = num_frames * overlap + max_len = x.shape[1] + embed = None + num_iters = seq_lens / (num_frames - num_overlap) + cur_iter = 0 + for offset in range(0, max_len, num_frames - num_overlap): + cur_iter += 1 + end_offset = min(x.shape[1], offset + num_frames) + frames = x[:, offset:end_offset] + if embed is None: + embed = self.inference(frames) + else: + embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) + return embed / num_iters + + # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if use_cuda: + self.cuda() + if eval: + self.eval() + assert not self.training diff --git a/speaker_encoder/models/resnet.py b/speaker_encoder/models/resnet.py new file mode 100644 index 0000000..fcc850d --- /dev/null +++ b/speaker_encoder/models/resnet.py @@ -0,0 +1,212 @@ +import numpy as np +import torch +from torch import nn + +from TTS.utils.io import load_fsspec + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + + +class ResNetSpeakerEncoder(nn.Module): + """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 + Adapted from: https://github.com/clovaai/voxceleb_trainer + """ + + # pylint: disable=W0102 + def __init__( + self, + input_dim=64, + proj_dim=512, + layers=[3, 4, 6, 3], + num_filters=[32, 64, 128, 256], + encoder_type="ASP", + log_input=False, + ): + super(ResNetSpeakerEncoder, self).__init__() + + self.encoder_type = encoder_type + self.input_dim = input_dim + self.log_input = log_input + self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) + self.relu = nn.ReLU(inplace=True) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + + self.inplanes = num_filters[0] + self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) + self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) + + self.instancenorm = nn.InstanceNorm1d(input_dim) + + outmap_size = int(self.input_dim / 8) + + self.attention = nn.Sequential( + nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), + nn.ReLU(), + nn.BatchNorm1d(128), + nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), + nn.Softmax(dim=2), + ) + + if self.encoder_type == "SAP": + out_dim = num_filters[3] * outmap_size + elif self.encoder_type == "ASP": + out_dim = num_filters[3] * outmap_size * 2 + else: + raise ValueError("Undefined encoder") + + self.fc = nn.Linear(out_dim, proj_dim) + + self._init_layers() + + def _init_layers(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def create_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + # pylint: disable=R0201 + def new_parameter(self, *size): + out = nn.Parameter(torch.FloatTensor(*size)) + nn.init.xavier_normal_(out) + return out + + def forward(self, x, l2_norm=False): + x = x.transpose(1, 2) + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.log_input: + x = (x + 1e-6).log() + x = self.instancenorm(x).unsqueeze(1) + + x = self.conv1(x) + x = self.relu(x) + x = self.bn1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = x.reshape(x.size()[0], -1, x.size()[-1]) + + w = self.attention(x) + + if self.encoder_type == "SAP": + x = torch.sum(x * w, dim=2) + elif self.encoder_type == "ASP": + mu = torch.sum(x * w, dim=2) + sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + x = torch.cat((mu, sg), 1) + + x = x.view(x.size()[0], -1) + x = self.fc(x) + + if l2_norm: + x = torch.nn.functional.normalize(x, p=2, dim=1) + return x + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + max_len = x.shape[1] + + 
if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.forward(frames_batch, l2_norm=True) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + + return embeddings + + def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if use_cuda: + self.cuda() + if eval: + self.eval() + assert not self.training diff --git a/speaker_encoder/requirements.txt b/speaker_encoder/requirements.txt new file mode 100644 index 0000000..a486cc4 --- /dev/null +++ b/speaker_encoder/requirements.txt @@ -0,0 +1,2 @@ +umap-learn +numpy>=1.17.0 diff --git a/speaker_encoder/speaker_encoder_config.py b/speaker_encoder/speaker_encoder_config.py new file mode 100644 index 0000000..f953052 --- /dev/null +++ b/speaker_encoder/speaker_encoder_config.py @@ -0,0 +1,65 @@ +from dataclasses import asdict, dataclass, field +from typing import Dict, List + +from coqpit import MISSING + +from config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig + + +@dataclass +class SpeakerEncoderConfig(BaseTrainingConfig): + """Defines parameters for Speaker Encoder model.""" + + model: str = "speaker_encoder" + audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) + datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + # model params + model_params: Dict = field( + default_factory=lambda: { + "model_name": "lstm", + "input_dim": 80, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": True, + } + ) + + audio_augmentation: Dict = field(default_factory=lambda: {}) + + storage: Dict = field( + default_factory=lambda: { + "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 15, # the size of the in-memory storage with respect to a single batch + } + ) + + # training params + max_train_step: int = 1000000 # end training when number of training steps reaches this value. + loss: str = "angleproto" + grad_clip: float = 3.0 + lr: float = 0.0001 + lr_decay: bool = False + warmup_steps: int = 4000 + wd: float = 1e-6 + + # logging params + tb_model_param_stats: bool = False + steps_plot_stats: int = 10 + checkpoint: bool = True + save_step: int = 1000 + print_step: int = 20 + + # data loader + num_speakers_in_batch: int = MISSING + num_utters_per_speaker: int = MISSING + num_loader_workers: int = MISSING + skip_speakers: bool = False + voice_len: float = 1.6 + + def check_values(self): + super().check_values() + c = asdict(self) + assert ( + c["model_params"]["input_dim"] == self.audio.num_mels + ), " [!] model input dimendion must be equal to melspectrogram dimension." 
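The scripts that tie these pieces together (vi_speaker_single.py, vi_speaker_batch.py, utils/audio.py, utils/io.py) appear in the diffstat above but are not included in this excerpt. As a rough orientation only, the following minimal sketch shows how the LSTM encoder and the config shown above can be used to extract one d-vector; the `wav_to_dvector` wrapper and its argument handling are illustrative assumptions rather than the actual vi_speaker_single.py, and the `AudioProcessor` calls simply mirror the ones used in speaker_encoder/dataset.py.

```python
import numpy as np
import torch

from speaker_encoder.models.lstm import LSTMSpeakerEncoder
from utils.audio import AudioProcessor  # added by this patch (utils/audio.py, not shown above)


def wav_to_dvector(wav_path, checkpoint_path, audio_cfg, model_cfg, use_cuda=False):
    """Return one d-vector (1 x proj_dim, L2-normalized) for a single wav file."""
    # audio_cfg / model_cfg are the "audio" and "model" blocks of saved_models/config.json
    ap = AudioProcessor(**audio_cfg)
    model = LSTMSpeakerEncoder(
        model_cfg["input_dim"],
        model_cfg["proj_dim"],
        model_cfg["lstm_dim"],
        model_cfg["num_lstm_layers"],
        use_lstm_with_projection=model_cfg["use_lstm_with_projection"],
    )
    model.load_checkpoint(checkpoint_path, eval=True, use_cuda=use_cuda)

    wav = ap.load_wav(wav_path, sr=ap.sample_rate)   # same call as in speaker_encoder/dataset.py
    mel = ap.melspectrogram(wav).astype("float32")   # shape (num_mels, T)
    x = torch.FloatTensor(mel.T).unsqueeze(0)        # 1 x T x num_mels, as compute_embedding expects
    if use_cuda:
        x = x.cuda()
    embedding = model.compute_embedding(x)           # averaged over num_eval sliding windows
    return embedding.detach().cpu().numpy()


# e.g. np.save("TEST.npy", wav_to_dvector("TEST.wav",
#              "./saved_models/best_model.pth.tar", audio_cfg, model_cfg))
```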
diff --git a/speaker_encoder/umap.png b/speaker_encoder/umap.png new file mode 100644 index 0000000000000000000000000000000000000000..ca8aefeac8cbe616983b35e968c9c9133eb41ede GIT binary patch literal 24616 [binary image data omitted]
[remaining binary image data truncated; the diff headers for speaker_encoder/utils/__init__.py and speaker_encoder/utils/generic_utils.py and the opening of generic_utils.py (imports and the start of the in-memory Storage class) are lost here; the readable text resumes inside Storage.__init__] + self.ignore_last_batch = True + + # used for fast random sample + self.safe_storage_size = self.maxsize - self.num_threads + if self.ignore_last_batch: + self.safe_storage_size -= self.num_speakers_in_batch + + def __len__(self): + return len(self.storage) + + def full(self): + return len(self.storage) >= self.maxsize + + def append(self, item): + # if storage is full, remove an item + if self.full(): + self.storage.pop(0) + + self.storage.append(item) + + def get_random_sample(self): + # safe storage size, considering that all threads may remove one item from storage at the same time + storage_size = len(self.storage) - self.num_threads + + if self.ignore_last_batch: + storage_size -= self.num_speakers_in_batch + + return self.storage[random.randint(0, storage_size)] + + def get_random_sample_fast(self): + """Call this method only when storage is full""" + return self.storage[random.randint(0, self.safe_storage_size)] + + +class AugmentWAV(object): + def __init__(self, ap, augmentation_config): + + self.ap = ap + self.use_additive_noise = False + + if "additive" in augmentation_config.keys(): + self.additive_noise_config = augmentation_config["additive"] + additive_path = self.additive_noise_config["sounds_path"] + if additive_path: + self.use_additive_noise = True + # get noise types + self.additive_noise_types = [] + for key in self.additive_noise_config.keys(): + if isinstance(self.additive_noise_config[key], dict): + self.additive_noise_types.append(key) + + additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True) + + self.noise_list = {} + + for wav_file in additive_files: + noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0] + # ignore directories that are not listed + if noise_dir not in self.additive_noise_types: + continue + if noise_dir not in self.noise_list: + self.noise_list[noise_dir] = [] + self.noise_list[noise_dir].append(wav_file) + + print( + f" | > Using Additive Noise Augmentation: with {len(additive_files)} audio instances from {self.additive_noise_types}" + ) + + self.use_rir = False + + if "rir" in augmentation_config.keys(): + self.rir_config = augmentation_config["rir"] + if self.rir_config["rir_path"]: + self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True) + self.use_rir = True + + print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audio instances") + + self.create_augmentation_global_list() + + def create_augmentation_global_list(self): + if self.use_additive_noise: + self.global_noise_list = self.additive_noise_types + else: + self.global_noise_list = [] + if self.use_rir: + self.global_noise_list.append("RIR_AUG") + + def additive_noise(self, noise_type, audio): + + clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4) + + noise_list = random.sample( + self.noise_list[noise_type], + random.randint( + self.additive_noise_config[noise_type]["min_num_noises"], + self.additive_noise_config[noise_type]["max_num_noises"], + ), + ) + + audio_len = audio.shape[0] + noises_wav = None + for noise in noise_list: + noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len] + + if noiseaudio.shape[0] < audio_len: + continue + + noise_snr = random.uniform( + self.additive_noise_config[noise_type]["min_snr_in_db"], + self.additive_noise_config[noise_type]["max_snr_in_db"], + ) + 
noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4) + noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio + + if noises_wav is None: + noises_wav = noise_wav + else: + noises_wav += noise_wav + + # if all possible files is less than audio, choose other files + if noises_wav is None: + return self.additive_noise(noise_type, audio) + + return audio + noises_wav + + def reverberate(self, audio): + audio_len = audio.shape[0] + + rir_file = random.choice(self.rir_files) + rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) + rir = rir / np.sqrt(np.sum(rir ** 2)) + return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len] + + def apply_one(self, audio): + noise_type = random.choice(self.global_noise_list) + if noise_type == "RIR_AUG": + return self.reverberate(audio) + + return self.additive_noise(noise_type, audio) + + +def to_camel(text): + text = text.capitalize() + return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + + +def setup_model(c): + if c.model_params["model_name"].lower() == "lstm": + model = LSTMSpeakerEncoder( + c.model_params["input_dim"], + c.model_params["proj_dim"], + c.model_params["lstm_dim"], + c.model_params["num_lstm_layers"], + ) + elif c.model_params["model_name"].lower() == "resnet": + model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) + return model + + +def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): + checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = os.path.join(out_path, checkpoint_path) + print(" | | > Checkpoint saving : {}".format(checkpoint_path)) + + new_state_dict = model.state_dict() + state = { + "model": new_state_dict, + "optimizer": optimizer.state_dict() if optimizer is not None else None, + "criterion": criterion.state_dict(), + "step": current_step, + "epoch": epoch, + "loss": model_loss, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + save_fsspec(state, checkpoint_path) + + +def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step): + if model_loss < best_loss: + new_state_dict = model.state_dict() + state = { + "model": new_state_dict, + "optimizer": optimizer.state_dict(), + "criterion": criterion.state_dict(), + "step": current_step, + "loss": model_loss, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + best_loss = model_loss + bestmodel_path = "best_model.pth.tar" + bestmodel_path = os.path.join(out_path, bestmodel_path) + print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) + save_fsspec(state, bestmodel_path) + return best_loss diff --git a/speaker_encoder/utils/io.py b/speaker_encoder/utils/io.py new file mode 100644 index 0000000..7a3aadc --- /dev/null +++ b/speaker_encoder/utils/io.py @@ -0,0 +1,38 @@ +import datetime +import os + +from TTS.utils.io import save_fsspec + + +def save_checkpoint(model, optimizer, model_loss, out_path, current_step): + checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = os.path.join(out_path, checkpoint_path) + print(" | | > Checkpoint saving : {}".format(checkpoint_path)) + + new_state_dict = model.state_dict() + state = { + "model": new_state_dict, + "optimizer": optimizer.state_dict() if optimizer is not None else None, + "step": current_step, + "loss": model_loss, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + save_fsspec(state, checkpoint_path) + + +def 
save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): + if model_loss < best_loss: + new_state_dict = model.state_dict() + state = { + "model": new_state_dict, + "optimizer": optimizer.state_dict(), + "step": current_step, + "loss": model_loss, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + best_loss = model_loss + bestmodel_path = "best_model.pth.tar" + bestmodel_path = os.path.join(out_path, bestmodel_path) + print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) + save_fsspec(state, bestmodel_path) + return best_loss diff --git a/speaker_encoder/utils/prepare_voxceleb.py b/speaker_encoder/utils/prepare_voxceleb.py new file mode 100644 index 0000000..b93baf9 --- /dev/null +++ b/speaker_encoder/utils/prepare_voxceleb.py @@ -0,0 +1,219 @@ +# coding=utf-8 +# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Only support eager mode and TF>=2.0.0 +# pylint: disable=no-member, invalid-name, relative-beyond-top-level +# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes +""" voxceleb 1 & 2 """ + +import hashlib +import os +import subprocess +import sys +import zipfile + +import pandas +import soundfile as sf +from absl import logging + +SUBSETS = { + "vox1_dev_wav": [ + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad", + ], + "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"], + "vox2_dev_aac": [ + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag", + "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah", + ], + "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"], +} + +MD5SUM = { + "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b", + "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402", + "vox1_test_wav": "185fdc63c3c739954633d50379a3d102", + "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312", +} + +USER = {"user": "", "password": ""} + +speaker_id_dict = {} + + +def download_and_extract(directory, subset, urls): + """Download and extract the given split of dataset. 
+ + Args: + directory: the directory where to put the downloaded data. + subset: subset name of the corpus. + urls: the list of urls to download the data file. + """ + os.makedirs(directory, exist_ok=True) + + try: + for url in urls: + zip_filepath = os.path.join(directory, url.split("/")[-1]) + if os.path.exists(zip_filepath): + continue + logging.info("Downloading %s to %s" % (url, zip_filepath)) + subprocess.call( + "wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath), + shell=True, + ) + + statinfo = os.stat(zip_filepath) + logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)) + + # concatenate all parts into zip files + if ".zip" not in zip_filepath: + zip_filepath = "_".join(zip_filepath.split("_")[:-1]) + subprocess.call("cat %s* > %s.zip" % (zip_filepath, zip_filepath), shell=True) + zip_filepath += ".zip" + extract_path = zip_filepath.strip(".zip") + + # check zip file md5sum + with open(zip_filepath, "rb") as f_zip: + md5 = hashlib.md5(f_zip.read()).hexdigest() + if md5 != MD5SUM[subset]: + raise ValueError("md5sum of %s mismatch" % zip_filepath) + + with zipfile.ZipFile(zip_filepath, "r") as zfile: + zfile.extractall(directory) + extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename) + subprocess.call("mv %s %s" % (extract_path_ori, extract_path), shell=True) + finally: + # os.remove(zip_filepath) + pass + + +def exec_cmd(cmd): + """Run a command in a subprocess. + Args: + cmd: command line to be executed. + Return: + int, the return code. + """ + try: + retcode = subprocess.call(cmd, shell=True) + if retcode < 0: + logging.info(f"Child was terminated by signal {retcode}") + except OSError as e: + logging.info(f"Execution failed: {e}") + retcode = -999 + return retcode + + +def decode_aac_with_ffmpeg(aac_file, wav_file): + """Decode a given AAC file into WAV using ffmpeg. + Args: + aac_file: file path to input AAC file. + wav_file: file path to output WAV file. + Return: + bool, True if success. + """ + cmd = f"ffmpeg -i {aac_file} {wav_file}" + logging.info(f"Decoding aac file using command line: {cmd}") + ret = exec_cmd(cmd) + if ret != 0: + logging.error(f"Failed to decode aac file with retcode {ret}") + logging.error("Please check your ffmpeg installation.") + return False + return True + + +def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): + """Optionally convert AAC to WAV and make speaker labels. + Args: + input_dir: the directory which holds the input dataset. + subset: the name of the specified subset. e.g. vox1_dev_wav + output_dir: the directory to place the newly generated csv files. + output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv + """ + + logging.info("Preprocessing audio and label for subset %s" % subset) + source_dir = os.path.join(input_dir, subset) + + files = [] + # Convert all AAC file into WAV format. At the same time, generate the csv + for root, _, filenames in os.walk(source_dir): + for filename in filenames: + name, ext = os.path.splitext(filename) + if ext.lower() == ".wav": + _, ext2 = os.path.splitext(name) + if ext2: + continue + wav_file = os.path.join(root, filename) + elif ext.lower() == ".m4a": + # Convert AAC to WAV. 
+ aac_file = os.path.join(root, filename) + wav_file = aac_file + ".wav" + if not os.path.exists(wav_file): + if not decode_aac_with_ffmpeg(aac_file, wav_file): + raise RuntimeError("Audio decoding failed.") + else: + continue + speaker_name = root.split(os.path.sep)[-2] + if speaker_name not in speaker_id_dict: + num = len(speaker_id_dict) + speaker_id_dict[speaker_name] = num + # wav_filesize = os.path.getsize(wav_file) + wav_length = len(sf.read(wav_file)[0]) + files.append((os.path.abspath(wav_file), wav_length, speaker_id_dict[speaker_name], speaker_name)) + + # Write to CSV file which contains four columns: + # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name". + csv_file_path = os.path.join(output_dir, output_file) + df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) + df.to_csv(csv_file_path, index=False, sep="\t") + logging.info("Successfully generated csv file {}".format(csv_file_path)) + + +def processor(directory, subset, force_process): + """download and process""" + urls = SUBSETS + if subset not in urls: + raise ValueError(subset, "is not in voxceleb") + + subset_csv = os.path.join(directory, subset + ".csv") + if not force_process and os.path.exists(subset_csv): + return subset_csv + + logging.info("Downloading and process the voxceleb in %s", directory) + logging.info("Preparing subset %s", subset) + download_and_extract(directory, subset, urls[subset]) + convert_audio_and_make_label(directory, subset, directory, subset + ".csv") + logging.info("Finished downloading and processing") + return subset_csv + + +if __name__ == "__main__": + logging.set_verbosity(logging.INFO) + if len(sys.argv) != 4: + print("Usage: python prepare_data.py save_directory user password") + sys.exit() + + DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3] + for SUBSET in SUBSETS: + processor(DIR, SUBSET, False) diff --git a/speaker_encoder/utils/visual.py b/speaker_encoder/utils/visual.py new file mode 100644 index 0000000..4f40f68 --- /dev/null +++ b/speaker_encoder/utils/visual.py @@ -0,0 +1,46 @@ +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import umap + +matplotlib.use("Agg") + + +colormap = ( + np.array( + [ + [76, 255, 0], + [0, 127, 70], + [255, 0, 0], + [255, 217, 38], + [0, 135, 255], + [165, 0, 165], + [255, 167, 255], + [0, 255, 255], + [255, 96, 38], + [142, 76, 0], + [33, 0, 127], + [0, 0, 0], + [183, 183, 183], + ], + dtype=np.float, + ) + / 255 +) + + +def plot_embeddings(embeddings, num_utter_per_speaker): + embeddings = embeddings[: 10 * num_utter_per_speaker] + model = umap.UMAP() + projection = model.fit_transform(embeddings) + num_speakers = embeddings.shape[0] // num_utter_per_speaker + ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker) + colors = [colormap[i] for i in ground_truth] + + fig, ax = plt.subplots(figsize=(16, 10)) + _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors) + plt.gca().set_aspect("equal", "datalim") + plt.title("UMAP projection") + plt.tight_layout() + plt.savefig("umap") + return fig diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/audio.py b/utils/audio.py new file mode 100644 index 0000000..99e00b6 --- /dev/null +++ b/utils/audio.py @@ -0,0 +1,822 @@ +from typing import Dict, Tuple + +import librosa +import numpy as np +import pyworld as pw +import scipy.io.wavfile +import scipy.signal +import soundfile as sf +import torch +from torch 
import nn + +class StandardScaler: + """StandardScaler for mean-scale normalization with the given mean and scale values.""" + + def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None: + self.mean_ = mean + self.scale_ = scale + + def set_stats(self, mean, scale): + self.mean_ = mean + self.scale_ = scale + + def reset_stats(self): + delattr(self, "mean_") + delattr(self, "scale_") + + def transform(self, X): + X = np.asarray(X) + X -= self.mean_ + X /= self.scale_ + return X + + def inverse_transform(self, X): + X = np.asarray(X) + X *= self.scale_ + X += self.mean_ + return X + +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """Some of the audio processing funtions using Torch for faster batch processing. + + TODO: Merge this with audio.py + """ + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + do_amp_to_db=False, + spec_gain=1.0, + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.do_amp_to_db = do_amp_to_db + self.spec_gain = spec_gain + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. + + Shapes: + x: [B x T] or [:math:`[B, 1, T]`] + """ + if x.ndim == 2: + x = x.unsqueeze(1) + if self.pad_wav: + padding = int((self.n_fft - self.hop_length) / 2) + x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") + # B x D x T x 2 + o = torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=False, + onesided=True, + return_complex=False, + ) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + if self.use_mel: + S = torch.matmul(self.mel_basis.to(x), S) + if self.do_amp_to_db: + S = self._amp_to_db(S, spec_gain=self.spec_gain) + return S + + def _build_mel_basis(self): + mel_basis = librosa.filters.mel( + self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + ) + self.mel_basis = torch.from_numpy(mel_basis).float() + + @staticmethod + def _amp_to_db(x, spec_gain=1.0): + return torch.log(torch.clamp(x, min=1e-5) * spec_gain) + + @staticmethod + def _db_to_amp(x, spec_gain=1.0): + return torch.exp(x) / spec_gain + + +# pylint: disable=too-many-public-methods +class AudioProcessor(object): + """Audio Processor for TTS used by all the data pipelines. + + Note: + All the class arguments are set to default values to enable a flexible initialization + of the class with the model config. They are not meaningful for all the arguments. + + Args: + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + resample (bool, optional): + enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False. + + num_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. 
+ + log_func (int, optional): + log exponent used for converting spectrogram aplitude to DB. + + min_level_db (int, optional): + minimum db threshold for the computed melspectrograms. Defaults to None. + + frame_shift_ms (int, optional): + milliseconds of frames between STFT columns. Defaults to None. + + frame_length_ms (int, optional): + milliseconds of STFT window length. Defaults to None. + + hop_length (int, optional): + number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None. + + win_length (int, optional): + STFT window length. Used if ```frame_length_ms``` is None. Defaults to None. + + ref_level_db (int, optional): + reference DB level to avoid background noise. In general <20DB corresponds to the air noise. Defaults to None. + + fft_size (int, optional): + FFT window size for STFT. Defaults to 1024. + + power (int, optional): + Exponent value applied to the spectrogram before GriffinLim. Defaults to None. + + preemphasis (float, optional): + Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0. + + signal_norm (bool, optional): + enable/disable signal normalization. Defaults to None. + + symmetric_norm (bool, optional): + enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None. + + max_norm (float, optional): + ```k``` defining the normalization range. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms.. Defaults to None. + + spec_gain (int, optional): + gain applied when converting amplitude to DB. Defaults to 20. + + stft_pad_mode (str, optional): + Padding mode for STFT. Defaults to 'reflect'. + + clip_norm (bool, optional): + enable/disable clipping the our of range values in the normalized audio signal. Defaults to True. + + griffin_lim_iters (int, optional): + Number of GriffinLim iterations. Defaults to None. + + do_trim_silence (bool, optional): + enable/disable silence trimming when loading the audio signal. Defaults to False. + + trim_db (int, optional): + DB threshold used for silence trimming. Defaults to 60. + + do_sound_norm (bool, optional): + enable/disable signal normalization. Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + + do_amp_to_db_mel (bool, optional): + enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + + stats_path (str, optional): + Path to the computed stats file. Defaults to None. + + verbose (bool, optional): + enable/disable logging. Defaults to True. 
+ + """ + + def __init__( + self, + sample_rate=None, + resample=False, + num_mels=None, + log_func="np.log10", + min_level_db=None, + frame_shift_ms=None, + frame_length_ms=None, + hop_length=None, + win_length=None, + ref_level_db=None, + fft_size=1024, + power=None, + preemphasis=0.0, + signal_norm=None, + symmetric_norm=None, + max_norm=None, + mel_fmin=None, + mel_fmax=None, + spec_gain=20, + stft_pad_mode="reflect", + clip_norm=True, + griffin_lim_iters=None, + do_trim_silence=False, + trim_db=60, + do_sound_norm=False, + do_amp_to_db_linear=True, + do_amp_to_db_mel=True, + stats_path=None, + verbose=True, + **_, + ): + + # setup class attributed + self.sample_rate = sample_rate + self.resample = resample + self.num_mels = num_mels + self.log_func = log_func + self.min_level_db = min_level_db or 0 + self.frame_shift_ms = frame_shift_ms + self.frame_length_ms = frame_length_ms + self.ref_level_db = ref_level_db + self.fft_size = fft_size + self.power = power + self.preemphasis = preemphasis + self.griffin_lim_iters = griffin_lim_iters + self.signal_norm = signal_norm + self.symmetric_norm = symmetric_norm + self.mel_fmin = mel_fmin or 0 + self.mel_fmax = mel_fmax + self.spec_gain = float(spec_gain) + self.stft_pad_mode = stft_pad_mode + self.max_norm = 1.0 if max_norm is None else float(max_norm) + self.clip_norm = clip_norm + self.do_trim_silence = do_trim_silence + self.trim_db = trim_db + self.do_sound_norm = do_sound_norm + self.do_amp_to_db_linear = do_amp_to_db_linear + self.do_amp_to_db_mel = do_amp_to_db_mel + self.stats_path = stats_path + # setup exp_func for db to amp conversion + if log_func == "np.log": + self.base = np.e + elif log_func == "np.log10": + self.base = 10 + else: + raise ValueError(" [!] unknown `log_func` value.") + # setup stft parameters + if hop_length is None: + # compute stft parameters from given time values + self.hop_length, self.win_length = self._stft_parameters() + else: + # use stft parameters from config file + self.hop_length = hop_length + self.win_length = win_length + assert min_level_db != 0.0, " [!] min_level_db is 0" + assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size" + members = vars(self) + if verbose: + print(" > Setting up Audio Processor...") + for key, value in members.items(): + print(" | > {}:{}".format(key, value)) + # create spectrogram utils + self.mel_basis = self._build_mel_basis() + self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) + # setup scaler + if stats_path and signal_norm: + mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) + self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + self.signal_norm = True + self.max_norm = None + self.clip_norm = None + self.symmetric_norm = None + + ### setting up the parameters ### + def _build_mel_basis( + self, + ) -> np.ndarray: + """Build melspectrogram basis. + + Returns: + np.ndarray: melspectrogram basis. + """ + if self.mel_fmax is not None: + assert self.mel_fmax <= self.sample_rate // 2 + return librosa.filters.mel( + self.sample_rate, self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + ) + + def _stft_parameters( + self, + ) -> Tuple[int, int]: + """Compute the real STFT parameters from the time values. + + Returns: + Tuple[int, int]: hop length and window length for STFT. + """ + factor = self.frame_length_ms / self.frame_shift_ms + assert (factor).is_integer(), " [!] 
frame_shift_ms should divide frame_length_ms" + hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) + win_length = int(hop_length * factor) + return hop_length, win_length + + ### normalization ### + def normalize(self, S: np.ndarray) -> np.ndarray: + """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` + + Args: + S (np.ndarray): Spectrogram to normalize. + + Raises: + RuntimeError: Mean and variance is computed from incompatible parameters. + + Returns: + np.ndarray: Normalized spectrogram. + """ + # pylint: disable=no-else-return + S = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, "mel_scaler"): + if S.shape[0] == self.num_mels: + return self.mel_scaler.transform(S.T).T + elif S.shape[0] == self.fft_size / 2: + return self.linear_scaler.transform(S.T).T + else: + raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") + # range normalization + S -= self.ref_level_db # discard certain range of DB assuming it is air noise + S_norm = (S - self.min_level_db) / (-self.min_level_db) + if self.symmetric_norm: + S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm + if self.clip_norm: + S_norm = np.clip( + S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + ) + return S_norm + else: + S_norm = self.max_norm * S_norm + if self.clip_norm: + S_norm = np.clip(S_norm, 0, self.max_norm) + return S_norm + else: + return S + + def denormalize(self, S: np.ndarray) -> np.ndarray: + """Denormalize spectrogram values. + + Args: + S (np.ndarray): Spectrogram to denormalize. + + Raises: + RuntimeError: Mean and variance are incompatible. + + Returns: + np.ndarray: Denormalized spectrogram. + """ + # pylint: disable=no-else-return + S_denorm = S.copy() + if self.signal_norm: + # mean-var scaling + if hasattr(self, "mel_scaler"): + if S_denorm.shape[0] == self.num_mels: + return self.mel_scaler.inverse_transform(S_denorm.T).T + elif S_denorm.shape[0] == self.fft_size / 2: + return self.linear_scaler.inverse_transform(S_denorm.T).T + else: + raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") + if self.symmetric_norm: + if self.clip_norm: + S_denorm = np.clip( + S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type + ) + S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db + return S_denorm + self.ref_level_db + else: + if self.clip_norm: + S_denorm = np.clip(S_denorm, 0, self.max_norm) + S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db + return S_denorm + self.ref_level_db + else: + return S_denorm + + ### Mean-STD scaling ### + def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]: + """Loading mean and variance statistics from a `npy` file. + + Args: + stats_path (str): Path to the `npy` file containing + + Returns: + Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to + compute them. 
+ """ + stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg + mel_mean = stats["mel_mean"] + mel_std = stats["mel_std"] + linear_mean = stats["linear_mean"] + linear_std = stats["linear_std"] + stats_config = stats["audio_config"] + # check all audio parameters used for computing stats + skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"] + for key in stats_config.keys(): + if key in skip_parameters: + continue + if key not in ["sample_rate", "trim_db"]: + assert ( + stats_config[key] == self.__dict__[key] + ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + return mel_mean, mel_std, linear_mean, linear_std, stats_config + + # pylint: disable=attribute-defined-outside-init + def setup_scaler( + self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray + ) -> None: + """Initialize scaler objects used in mean-std normalization. + + Args: + mel_mean (np.ndarray): Mean for melspectrograms. + mel_std (np.ndarray): STD for melspectrograms. + linear_mean (np.ndarray): Mean for full scale spectrograms. + linear_std (np.ndarray): STD for full scale spectrograms. + """ + self.mel_scaler = StandardScaler() + self.mel_scaler.set_stats(mel_mean, mel_std) + self.linear_scaler = StandardScaler() + self.linear_scaler.set_stats(linear_mean, linear_std) + + ### DB and AMP conversion ### + # pylint: disable=no-self-use + def _amp_to_db(self, x: np.ndarray) -> np.ndarray: + """Convert amplitude values to decibels. + + Args: + x (np.ndarray): Amplitude spectrogram. + + Returns: + np.ndarray: Decibels spectrogram. + """ + return self.spec_gain * _log(np.maximum(1e-5, x), self.base) + + # pylint: disable=no-self-use + def _db_to_amp(self, x: np.ndarray) -> np.ndarray: + """Convert decibels spectrogram to amplitude spectrogram. + + Args: + x (np.ndarray): Decibels spectrogram. + + Returns: + np.ndarray: Amplitude spectrogram. + """ + return _exp(x / self.spec_gain, self.base) + + ### Preemphasis ### + def apply_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values. + + Args: + x (np.ndarray): Audio signal. + + Raises: + RuntimeError: Preemphasis coeff is set to 0. + + Returns: + np.ndarray: Decorrelated audio signal. + """ + if self.preemphasis == 0: + raise RuntimeError(" [!] Preemphasis is set 0.0.") + return scipy.signal.lfilter([1, -self.preemphasis], [1], x) + + def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Reverse pre-emphasis.""" + if self.preemphasis == 0: + raise RuntimeError(" [!] Preemphasis is set 0.0.") + return scipy.signal.lfilter([1], [1, -self.preemphasis], x) + + ### SPECTROGRAMs ### + def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray: + """Project a full scale spectrogram to a melspectrogram. + + Args: + spectrogram (np.ndarray): Full scale spectrogram. + + Returns: + np.ndarray: Melspectrogram + """ + return np.dot(self.mel_basis, spectrogram) + + def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to full scale spectrogram.""" + return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) + + def spectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a spectrogram from a waveform. + + Args: + y (np.ndarray): Waveform. + + Returns: + np.ndarray: Spectrogram. 
+ """ + if self.preemphasis != 0: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + if self.do_amp_to_db_linear: + S = self._amp_to_db(np.abs(D)) + else: + S = np.abs(D) + return self.normalize(S).astype(np.float32) + + def melspectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a melspectrogram from a waveform.""" + if self.preemphasis != 0: + D = self._stft(self.apply_preemphasis(y)) + else: + D = self._stft(y) + if self.do_amp_to_db_mel: + S = self._amp_to_db(self._linear_to_mel(np.abs(D))) + else: + S = self._linear_to_mel(np.abs(D)) + return self.normalize(S).astype(np.float32) + + def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: + """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" + S = self.denormalize(spectrogram) + S = self._db_to_amp(S) + # Reconstruct phase + if self.preemphasis != 0: + return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) + return self._griffin_lim(S ** self.power) + + def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" + D = self.denormalize(mel_spectrogram) + S = self._db_to_amp(D) + S = self._mel_to_linear(S) # Convert back to linear + if self.preemphasis != 0: + return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) + return self._griffin_lim(S ** self.power) + + def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: + """Convert a full scale linear spectrogram output of a network to a melspectrogram. + + Args: + linear_spec (np.ndarray): Normalized full scale linear spectrogram. + + Returns: + np.ndarray: Normalized melspectrogram. + """ + S = self.denormalize(linear_spec) + S = self._db_to_amp(S) + S = self._linear_to_mel(np.abs(S)) + S = self._amp_to_db(S) + mel = self.normalize(S) + return mel + + ### STFT and ISTFT ### + def _stft(self, y: np.ndarray) -> np.ndarray: + """Librosa STFT wrapper. + + Args: + y (np.ndarray): Audio signal. + + Returns: + np.ndarray: Complex number array. + """ + return librosa.stft( + y=y, + n_fft=self.fft_size, + hop_length=self.hop_length, + win_length=self.win_length, + pad_mode=self.stft_pad_mode, + window="hann", + center=True, + ) + + def _istft(self, y: np.ndarray) -> np.ndarray: + """Librosa iSTFT wrapper.""" + return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length) + + def _griffin_lim(self, S): + angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) + S_complex = np.abs(S).astype(np.complex) + y = self._istft(S_complex * angles) + if not np.isfinite(y).all(): + print(" [!] Waveform is not finite everywhere. Skipping the GL.") + return np.array([0.0]) + for _ in range(self.griffin_lim_iters): + angles = np.exp(1j * np.angle(self._stft(y))) + y = self._istft(S_complex * angles) + return y + + def compute_stft_paddings(self, x, pad_sides=1): + """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding + (first and final frames)""" + assert pad_sides in (1, 2) + pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] + if pad_sides == 1: + return 0, pad + return pad // 2, pad // 2 + pad % 2 + + def compute_f0(self, x: np.ndarray) -> np.ndarray: + """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. + + Args: + x (np.ndarray): Waveform. + + Returns: + np.ndarray: Pitch. 
+ + Examples: + >>> WAV_FILE = filename = librosa.util.example_audio_file() + >>> from TTS.config import BaseAudioConfig + >>> from TTS.utils.audio import AudioProcessor + >>> conf = BaseAudioConfig(mel_fmax=8000) + >>> ap = AudioProcessor(**conf) + >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] + >>> pitch = ap.compute_f0(wav) + """ + f0, t = pw.dio( + x.astype(np.double), + fs=self.sample_rate, + f0_ceil=self.mel_fmax, + frame_period=1000 * self.hop_length / self.sample_rate, + ) + f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) + # pad = int((self.win_length / self.hop_length) / 2) + # f0 = [0.0] * pad + f0 + [0.0] * pad + # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0) + # f0 = np.array(f0, dtype=np.float32) + + # f01, _, _ = librosa.pyin( + # x, + # fmin=65 if self.mel_fmin == 0 else self.mel_fmin, + # fmax=self.mel_fmax, + # frame_length=self.win_length, + # sr=self.sample_rate, + # fill_na=0.0, + # ) + + # spec = self.melspectrogram(x) + return f0 + + ### Audio Processing ### + def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int: + """Find the last point without silence at the end of a audio signal. + + Args: + wav (np.ndarray): Audio signal. + threshold_db (int, optional): Silence threshold in decibels. Defaults to -40. + min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8. + + Returns: + int: Last point without silence. + """ + window_length = int(self.sample_rate * min_silence_sec) + hop_length = int(window_length / 4) + threshold = self._db_to_amp(threshold_db) + for x in range(hop_length, len(wav) - window_length, hop_length): + if np.max(wav[x : x + window_length]) < threshold: + return x + hop_length + return len(wav) + + def trim_silence(self, wav): + """Trim silent parts with a threshold and 0.01 sec margin""" + margin = int(self.sample_rate * 0.01) + wav = wav[margin:-margin] + return librosa.effects.trim(wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[ + 0 + ] + + @staticmethod + def sound_norm(x: np.ndarray) -> np.ndarray: + """Normalize the volume of an audio signal. + + Args: + x (np.ndarray): Raw waveform. + + Returns: + np.ndarray: Volume normalized waveform. + """ + return x / abs(x).max() * 0.95 + + ### save and load ### + def load_wav(self, filename: str, sr: int = None) -> np.ndarray: + """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + + Args: + filename (str): Path to the wav file. + sr (int, optional): Sampling rate for resampling. Defaults to None. + + Returns: + np.ndarray: Loaded waveform. + """ + if self.resample: + x, sr = librosa.load(filename, sr=self.sample_rate) + elif sr is None: + x, sr = sf.read(filename) + assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr) + else: + x, sr = librosa.load(filename, sr=sr) + if self.do_trim_silence: + try: + x = self.trim_silence(x) + except ValueError: + print(f" [!] File cannot be trimmed for silence - {filename}") + if self.do_sound_norm: + x = self.sound_norm(x) + return x + + def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: + """Save a waveform to a file using Scipy. + + Args: + wav (np.ndarray): Waveform to save. + path (str): Path to a output file. + sr (int, optional): Sampling rate used for saving to the file. Defaults to None. 
+ """ + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) + + @staticmethod + def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray: + mu = 2 ** qc - 1 + # wav_abs = np.minimum(np.abs(wav), 1.0) + signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) + # Quantize signal to the specified number of levels. + signal = (signal + 1) / 2 * mu + 0.5 + return np.floor( + signal, + ) + + @staticmethod + def mulaw_decode(wav, qc): + """Recovers waveform from quantized values.""" + mu = 2 ** qc - 1 + x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + return x + + @staticmethod + def encode_16bits(x): + return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16) + + @staticmethod + def quantize(x: np.ndarray, bits: int) -> np.ndarray: + """Quantize a waveform to a given number of bits. + + Args: + x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`. + bits (int): Number of quantization bits. + + Returns: + np.ndarray: Quantized waveform. + """ + return (x + 1.0) * (2 ** bits - 1) / 2 + + @staticmethod + def dequantize(x, bits): + """Dequantize a waveform from the given number of bits.""" + return 2 * x / (2 ** bits - 1) - 1 + + +def _log(x, base): + if base == 10: + return np.log10(x) + return np.log(x) + + +def _exp(x, base): + if base == 10: + return np.power(10, x) + return np.exp(x) diff --git a/utils/io.py b/utils/io.py new file mode 100644 index 0000000..e4a068c --- /dev/null +++ b/utils/io.py @@ -0,0 +1,198 @@ +import datetime +import json +import os +import pickle as pickle_tts +import shutil +from typing import Any, Callable, Dict, Union + +import fsspec +import torch +from coqpit import Coqpit + + +class RenamingUnpickler(pickle_tts.Unpickler): + """Overload default pickler to solve module renaming problem""" + + def find_class(self, module, name): + return super().find_class(module.replace("mozilla_voice_tts", "TTS"), name) + + +class AttrDict(dict): + """A custom dict which converts dict keys + to class attributes""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__dict__ = self + + +def copy_model_files(config: Coqpit, out_path, new_fields): + """Copy config.json and other model files to training folder and add + new fields. + + Args: + config (Coqpit): Coqpit config defining the training run. + out_path (str): output path to copy the file. + new_fields (dict): new fileds to be added or edited + in the config file. + """ + copy_config_path = os.path.join(out_path, "config.json") + # add extra information fields + config.update(new_fields, allow_new=True) + # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths. + with fsspec.open(copy_config_path, "w", encoding="utf8") as f: + json.dump(config.to_dict(), f, indent=4) + + # copy model stats file if available + if config.audio.stats_path is not None: + copy_stats_path = os.path.join(out_path, "scale_stats.npy") + filesystem = fsspec.get_mapper(copy_stats_path).fs + if not filesystem.exists(copy_stats_path): + with fsspec.open(config.audio.stats_path, "rb") as source_file: + with fsspec.open(copy_stats_path, "wb") as target_file: + shutil.copyfileobj(source_file, target_file) + + +def load_fsspec( + path: str, + map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, + **kwargs, +) -> Any: + """Like torch.load but can load from other locations (e.g. s3:// , gs://). 
+ + Args: + path: Any path or url supported by fsspec. + map_location: torch.device or str. + **kwargs: Keyword arguments forwarded to torch.load. + + Returns: + Object stored in path. + """ + with fsspec.open(path, "rb") as f: + return torch.load(f, map_location=map_location, **kwargs) + + +def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin + try: + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + except ModuleNotFoundError: + pickle_tts.Unpickler = RenamingUnpickler + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) + model.load_state_dict(state["model"]) + if use_cuda: + model.cuda() + if eval: + model.eval() + return model, state + + +def save_fsspec(state: Any, path: str, **kwargs): + """Like torch.save but can save to other locations (e.g. s3:// , gs://). + + Args: + state: State object to save + path: Any path or url supported by fsspec. + **kwargs: Keyword arguments forwarded to torch.save. + """ + with fsspec.open(path, "wb") as f: + torch.save(state, f, **kwargs) + + +def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): + if hasattr(model, "module"): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() + if isinstance(optimizer, list): + optimizer_state = [optim.state_dict() for optim in optimizer] + else: + optimizer_state = optimizer.state_dict() if optimizer is not None else None + + if isinstance(scaler, list): + scaler_state = [s.state_dict() for s in scaler] + else: + scaler_state = scaler.state_dict() if scaler is not None else None + + if isinstance(config, Coqpit): + config = config.to_dict() + + state = { + "config": config, + "model": model_state, + "optimizer": optimizer_state, + "scaler": scaler_state, + "step": current_step, + "epoch": epoch, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + state.update(kwargs) + save_fsspec(state, output_path) + + +def save_checkpoint( + config, + model, + optimizer, + scaler, + current_step, + epoch, + output_folder, + **kwargs, +): + file_name = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = os.path.join(output_folder, file_name) + print("\n > CHECKPOINT : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + **kwargs, + ) + + +def save_best_model( + current_loss, + best_loss, + config, + model, + optimizer, + scaler, + current_step, + epoch, + out_path, + keep_all_best=False, + keep_after=10000, + **kwargs, +): + if current_loss < best_loss: + best_model_name = f"best_model_{current_step}.pth.tar" + checkpoint_path = os.path.join(out_path, best_model_name) + print(" > BEST MODEL : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + model_loss=current_loss, + **kwargs, + ) + fs = fsspec.get_mapper(out_path).fs + # only delete previous if current is saved successfully + if not keep_all_best or (current_step < keep_after): + model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar")) + for model_name in model_names: + if os.path.basename(model_name) != best_model_name: + fs.rm(model_name) + # create a shortcut which always points to the currently best model + shortcut_name = "best_model.pth.tar" + shortcut_path = os.path.join(out_path, shortcut_name) + fs.copy(checkpoint_path, shortcut_path) + best_loss = current_loss + return best_loss diff 
--git a/vi_speaker_batch.py b/vi_speaker_batch.py new file mode 100644 index 0000000..503c739 --- /dev/null +++ b/vi_speaker_batch.py @@ -0,0 +1,88 @@ +import os +import re +import json +import fsspec +import torch +import numpy as np +import argparse + +from tqdm import tqdm +from argparse import RawTextHelpFormatter +from speaker_encoder.models.lstm import LSTMSpeakerEncoder +from speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig + +from utils.audio import AudioProcessor +from vi_speaker_single import read_json + + +def get_spk_wavs(dataset_path, output_path): + wav_files = [] + os.makedirs(f"./{output_path}") + for spks in os.listdir(dataset_path): + if os.path.isdir(f"./{dataset_path}/{spks}"): + os.makedirs(f"./{output_path}/{spks}") + for file in os.listdir(f"./{dataset_path}/{spks}"): + if file.endswith(".wav"): + wav_files.append(f"./{dataset_path}/{spks}/{file}") + elif spks.endswith(".wav"): + wav_files.append(f"./{dataset_path}/{spks}") + return wav_files + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="""Compute embedding vectors for each wav file in a dataset.""", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") + parser.add_argument("config_path", type=str, help="Path to model config file.") + parser.add_argument("dataset_path", type=str, help="Path to dataset waves.") + parser.add_argument( + "output_path", type=str, help="path for output speaker/speaker_wavs.npy." + ) + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + args = parser.parse_args() + dataset_path = args.dataset_path + output_path = args.output_path + + # config + config_dict = read_json(args.config_path) + + # model + config = SpeakerEncoderConfig(config_dict) + config.from_dict(config_dict) + + speaker_encoder = LSTMSpeakerEncoder( + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], + ) + + speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda) + + # preprocess + speaker_encoder_ap = AudioProcessor(**config.audio) + # normalize the input audio level and trim silences + speaker_encoder_ap.do_sound_norm = True + speaker_encoder_ap.do_trim_silence = True + + wav_files = get_spk_wavs(dataset_path, output_path) + + # compute speaker embeddings + for idx, wav_file in enumerate(tqdm(wav_files)): + waveform = speaker_encoder_ap.load_wav( + wav_file, sr=speaker_encoder_ap.sample_rate + ) + spec = speaker_encoder_ap.melspectrogram(waveform) + spec = torch.from_numpy(spec.T) + if args.use_cuda: + spec = spec.cuda() + spec = spec.unsqueeze(0) + embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy() + embed = embed.squeeze() + embed_path = wav_file.replace(dataset_path, output_path) + embed_path = embed_path.replace(".wav", ".npy") + np.save(embed_path, embed, allow_pickle=False) diff --git a/vi_speaker_center.py b/vi_speaker_center.py new file mode 100644 index 0000000..62a1e44 --- /dev/null +++ b/vi_speaker_center.py @@ -0,0 +1,21 @@ +import os +import numpy as np + +single_id_path = "speaker_embedding" +center_id_path = "speaker_embedding_center" + +os.makedirs(f"./{center_id_path}") + +for speaker in os.listdir(single_id_path): + if os.path.isdir(f"./{single_id_path}/{speaker}"): + print(f"---->{speaker}<----") + subfile_num = 0 + 
speaker_cen = 0
+        for file in os.listdir(f"./{single_id_path}/{speaker}"):
+            if file.endswith(".npy"):
+                source_embed = np.load(f"./{single_id_path}/{speaker}/{file}")
+                source_embed = source_embed.astype(np.float32)
+                speaker_cen = speaker_cen + source_embed
+                subfile_num = subfile_num + 1
+        speaker_cen = speaker_cen / subfile_num
+        np.save(f"./{center_id_path}/{speaker}.npy", speaker_cen, allow_pickle=False)
diff --git a/vi_speaker_single.py b/vi_speaker_single.py
new file mode 100644
index 0000000..7260f53
--- /dev/null
+++ b/vi_speaker_single.py
@@ -0,0 +1,109 @@
+import re
+import json
+import fsspec
+import torch
+import numpy as np
+import argparse
+
+from argparse import RawTextHelpFormatter
+from speaker_encoder.models.lstm import LSTMSpeakerEncoder
+from speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig
+
+from utils.audio import AudioProcessor
+
+
+def read_json(json_path):
+    config_dict = {}
+    try:
+        with fsspec.open(json_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except json.decoder.JSONDecodeError:
+        # backwards compat.
+        data = read_json_with_comments(json_path)
+    config_dict.update(data)
+    return config_dict
+
+
+def read_json_with_comments(json_path):
+    """for backward compat."""
+    # fallback to json
+    with fsspec.open(json_path, "r", encoding="utf-8") as f:
+        input_str = f.read()
+    # handle comments
+    input_str = re.sub(r"\\\n", "", input_str)
+    input_str = re.sub(r"//.*\n", "\n", input_str)
+    data = json.loads(input_str)
+    return data
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        description="""Compute embedding vectors for each wav file in a dataset.""",
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
+    parser.add_argument(
+        "config_path",
+        type=str,
+        help="Path to model config file.",
+    )
+
+    parser.add_argument("-s", "--source", help="input wave", dest="source")
+    parser.add_argument(
+        "-t", "--target", help="output 256d speaker embedding", dest="target"
+    )
+
+    parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
+    parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+
+    args = parser.parse_args()
+    source_file = args.source
+    target_file = args.target
+
+    # config
+    config_dict = read_json(args.config_path)
+    # print(config_dict)
+
+    # model
+    config = SpeakerEncoderConfig(config_dict)
+    config.from_dict(config_dict)
+
+    speaker_encoder = LSTMSpeakerEncoder(
+        config.model_params["input_dim"],
+        config.model_params["proj_dim"],
+        config.model_params["lstm_dim"],
+        config.model_params["num_lstm_layers"],
+    )
+
+    speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda)
+
+    # preprocess
+    speaker_encoder_ap = AudioProcessor(**config.audio)
+    # normalize the input audio level and trim silences
+    speaker_encoder_ap.do_sound_norm = True
+    speaker_encoder_ap.do_trim_silence = True
+
+    # compute speaker embeddings
+
+    # extract the embedding
+    waveform = speaker_encoder_ap.load_wav(
+        source_file, sr=speaker_encoder_ap.sample_rate
+    )
+    spec = speaker_encoder_ap.melspectrogram(waveform)
+    spec = torch.from_numpy(spec.T)
+    if args.use_cuda:
+        spec = spec.cuda()
+    spec = spec.unsqueeze(0)
+    embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy()
+    embed = embed.squeeze()
+    # print(embed)
+    # print(embed.size)
+    np.save(target_file, embed, allow_pickle=False)
+
+
+    if hasattr(speaker_encoder, 'module'):
+        state_dict = speaker_encoder.module.state_dict()
+    else:
+        state_dict = speaker_encoder.state_dict()
+    torch.save({'model': state_dict}, "model_small.pth")
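
# load the exported model_small.pth

vi_speaker_single.py ends by exporting a slimmed checkpoint, model_small.pth, that holds only {'model': state_dict}. The snippet below is a minimal sketch, not part of this patch, of one way to load that file back and extract a single embedding; it reuses the classes added above (LSTMSpeakerEncoder, SpeakerEncoderConfig, AudioProcessor, read_json), and the config/wav/npy paths are just the README's example paths, swap in your own.

import torch
import numpy as np

from speaker_encoder.models.lstm import LSTMSpeakerEncoder
from speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig
from utils.audio import AudioProcessor
from vi_speaker_single import read_json

# build the model the same way vi_speaker_single.py does
config_dict = read_json("./saved_models/config.json")
config = SpeakerEncoderConfig(config_dict)
config.from_dict(config_dict)
encoder = LSTMSpeakerEncoder(
    config.model_params["input_dim"],
    config.model_params["proj_dim"],
    config.model_params["lstm_dim"],
    config.model_params["num_lstm_layers"],
)

# model_small.pth stores only the weights, so torch.load + load_state_dict is enough
state = torch.load("model_small.pth", map_location="cpu")
encoder.load_state_dict(state["model"])
encoder.eval()

# same preprocessing as the extraction scripts
ap = AudioProcessor(**config.audio)
ap.do_sound_norm = True
ap.do_trim_silence = True

wav = ap.load_wav("TEST.wav", sr=ap.sample_rate)   # example input wav
spec = torch.from_numpy(ap.melspectrogram(wav).T).unsqueeze(0)
with torch.no_grad():
    embed = encoder.compute_embedding(spec).cpu().numpy().squeeze()
np.save("TEST.npy", embed, allow_pickle=False)     # example output path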