# (page-scrape residue removed: "Spaces: Running Running")
# ==================================================================================================
# DEEPFAKE AUDIO - encoder_train.py (Neural Identity Orchestration)
# ==================================================================================================
#
# DESCRIPTION
#    This script manages the training lifecycle of the Speaker Encoder. It optimizes a
#    d-vector based neural network to minimize the GE2E (Generalized End-to-End) loss.
#    The goal is to maximize the similarity between embeddings of the same speaker
#    while minimizing similarity between different speakers, enabling high-fidelity
#    zero-shot voice cloning.
#
# AUTHORS
#    - Amey Thakur (https://github.com/Amey-Thakur)
#    - Mega Satish (https://github.com/msatmod)
#
# CREDITS
#    Original Real-Time Voice Cloning methodology by CorentinJ
#    Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
#    Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
#    Video Demo: https://youtu.be/i3wnBcbHDbs
#    Research:   https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
#    Released under the MIT License
#    Release Date: 2021-02-06
# ==================================================================================================
import argparse
from pathlib import Path

from encoder.train import train
from utils.argutils import print_args


def _build_cli() -> argparse.ArgumentParser:
    """Construct the command-line interface for a speaker-encoder training run.

    Every option maps 1:1 onto a keyword parameter of ``encoder.train.train``.
    """
    cli = argparse.ArgumentParser(
        description="Encoder Training Hub: Optimizing identity embeddings from preprocessed data.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Session definition: positional arguments identify the run and its data.
    cli.add_argument("run_id", type=str,
                     help="Identifier for this training experiment. Models and logs will be organized under this ID.")
    cli.add_argument("clean_data_root", type=Path,
                     help="Root path to the mel-spectrograms generated by encoder_preprocess.py.")

    # Storage & telemetry: where artifacts land and how often they are produced.
    cli.add_argument("-m", "--models_dir", type=Path, default="saved_models",
                     help="Parent directory for serialized weights, backups, and diagnostic plots.")
    cli.add_argument("-v", "--vis_every", type=int, default=10,
                     help="Iteration frequency for updating training curves and loss metrics.")
    cli.add_argument("-u", "--umap_every", type=int, default=100,
                     help="Frequency of UMAP projections to visualize speaker cluster separation.")
    cli.add_argument("-s", "--save_every", type=int, default=500,
                     help="Step interval for materializing model weights (.pt) on disk.")
    cli.add_argument("-b", "--backup_every", type=int, default=7500,
                     help="Interval for creating immutable rolling backups of the model state.")
    cli.add_argument("-f", "--force_restart", action="store_true",
                     help="Bypass existing checkpoints and initialize weights from distribution (restart from scratch).")

    # Visualization server (Visdom) configuration.
    cli.add_argument("--visdom_server", type=str, default="http://localhost",
                     help="Remote address of the Visdom dashboard server.")
    cli.add_argument("--no_visdom", action="store_true",
                     help="Inhibit rich visual telemetry (not recommended for production monitoring).")
    return cli


if __name__ == "__main__":
    # Parse the CLI, echo the resolved configuration, then delegate to the
    # internal training engine.
    parser = _build_cli()
    args = parser.parse_args()
    print_args(args, parser)
    print("π€π» Scholarly Partnership: Amey Thakur & Mega Satish")
    print("π Initiating Neural Training Pipeline - Monitoring d-vector clusters...")
    train(**vars(args))