Commit ea89faa (parent: b5979c9), committed by mrfakename

Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "0.
+version = "0.2.0"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
@@ -21,6 +21,7 @@ dependencies = [
     "datasets",
     "ema_pytorch>=0.5.2",
     "gradio>=3.45.2",
+    "hydra-core>=1.3.0",
     "jieba",
     "librosa",
     "matplotlib",
@@ -39,7 +40,6 @@ dependencies = [
     "vocos",
     "wandb",
     "x_transformers>=1.31.14",
-    "hydra-core>=1.3.0",
 ]
 
 [project.optional-dependencies]
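The pyproject.toml change bumps the package version to 0.2.0 and moves `hydra-core>=1.3.0` into its alphabetical slot among the runtime dependencies. A quick, non-authoritative way to confirm both after reinstalling (for example with `pip install -e .` from the repo root) is to query the installed metadata; the distribution name `f5-tts` comes from the `[project]` table above.

```python
# Sanity-check sketch: the installed version should reflect the bump above,
# and hydra should be importable as a runtime dependency.
from importlib.metadata import version

print(version("f5-tts"))  # expected: 0.2.0 after this commit

import hydra  # pulled in by hydra-core>=1.3.0
print(hydra.__version__)
```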
src/f5_tts/configs/E2TTS_Base_train.yaml
CHANGED
@@ -3,41 +3,41 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16
+  num_workers: 16
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15
+  learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
-  bnb_optimizer: False
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
-  name: E2TTS_Base
-  tokenizer: pinyin
+  name: E2TTS_Base
+  tokenizer: pinyin
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 24
-    heads: 16
-    ff_mult: 4
+    dim: 1024
+    depth: 24
+    heads: 16
+    ff_mult: 4
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False
-  local_vocoder_path: None
+  is_local_vocoder: False # use local offline vocoder ckpt or not
+  local_vocoder_path: None # path to local vocoder
 
 ckpts:
-  logger: wandb
-  save_per_updates: 50000
-  last_per_steps: 5000
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
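Because these training configs are plain Hydra YAML files, they can be loaded outside of a training run to inspect or tweak the fields documented above. The snippet below is a minimal sketch (not part of the repo) using Hydra's compose API; it assumes the working directory is the repository root and uses override keys taken from this config.

```python
# Sketch: load the E2TTS_Base training config with Hydra's compose API and
# override a couple of fields using the dotted CLI-style syntax.
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base="1.3", config_path="src/f5_tts/configs"):
    cfg = compose(
        config_name="E2TTS_Base_train",
        overrides=["optim.epochs=5", "ckpts.logger=tensorboard"],
    )

print(OmegaConf.to_yaml(cfg.optim))  # epochs: 5, learning_rate: 7.5e-05, ...
print(cfg.model.arch.depth)          # 24 for E2TTS_Base
```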
src/f5_tts/configs/E2TTS_Small_train.yaml
CHANGED
@@ -5,9 +5,9 @@ hydra:
 datasets:
   name: Emilia_ZH_EN
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16
+  num_workers: 16
 
 optim:
   epochs: 15
@@ -37,7 +37,7 @@ model:
   local_vocoder_path: None
 
 ckpts:
-  logger: wandb
-  save_per_updates: 50000
-  last_per_steps: 5000
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
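The `# "frame" or "sample"` comment added here is the key distinction for `batch_size_per_gpu`: with `batch_size_type: frame` it counts mel frames rather than utterances. As a rough illustration only (this is not the repo's actual sampler in `src/f5_tts/model/dataset.py`), frame-wise batching packs variable-length utterances until a frame budget or the `max_samples` cap is reached:

```python
# Toy illustration of frame-wise batching: group utterances until adding the
# next one would exceed the per-GPU frame budget or the max_samples cap.
def pack_frame_batches(frame_lengths, batch_size_per_gpu=38400, max_samples=64):
    batches, batch, total = [], [], 0
    for n_frames in frame_lengths:
        if batch and (total + n_frames > batch_size_per_gpu or len(batch) >= max_samples):
            batches.append(batch)
            batch, total = [], 0
        batch.append(n_frames)
        total += n_frames
    if batch:
        batches.append(batch)
    return batches

# Each inner list sums to at most 38400 frames and holds at most 64 utterances.
print(pack_frame_batches([1200, 800, 2500, 600] * 40)[:2])
```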
src/f5_tts/configs/F5TTS_Base_train.yaml
CHANGED
@@ -3,43 +3,43 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16
+  num_workers: 16
 
 optim:
-  epochs: 15
-  learning_rate: 7.5e-5
+  epochs: 15
+  learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
-  bnb_optimizer: False
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
-  name: F5TTS_Base
-  tokenizer: pinyin
+  name: F5TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024
-    depth: 22
-    heads: 16
-    ff_mult: 2
-    text_dim: 512
-    conv_layers: 4
+    dim: 1024
+    depth: 22
+    heads: 16
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
   mel_spec:
-    target_sample_rate: 24000
-    n_mel_channels: 100
-    hop_length: 256
-    win_length: 1024
-    n_fft: 1024
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False
-  local_vocoder_path: None
+  is_local_vocoder: False # use local offline vocoder ckpt or not
+  local_vocoder_path: None # local vocoder path
 
 ckpts:
-  logger: wandb
-  save_per_updates: 50000
-  last_per_steps: 5000
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
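The inline comments above also pin down some useful arithmetic: 8 GPUs at 38400 frames each give 307200 mel frames per optimizer update (with `grad_accumulation_steps: 1`), and at `hop_length: 256` and a 24 kHz sample rate that is roughly 3277 seconds of audio per update. A quick check of those numbers:

```python
# Back-of-the-envelope check of the batch-size comments in the config above.
frames_per_gpu = 38400
num_gpus = 8
hop_length = 256
sample_rate = 24000

frames_per_update = frames_per_gpu * num_gpus              # 307200, matches the comment
seconds_per_update = frames_per_update * hop_length / sample_rate
print(frames_per_update, round(seconds_per_update, 1))     # 307200 3276.8
```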
src/f5_tts/configs/F5TTS_Small_train.yaml
CHANGED
@@ -5,17 +5,17 @@ hydra:
 datasets:
   name: Emilia_ZH_EN
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16
+  num_workers: 16
 
 optim:
   epochs: 15
   learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
-  bnb_optimizer: False
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
   name: F5TTS_Small
@@ -39,7 +39,7 @@ model:
   local_vocoder_path: None
 
 ckpts:
-  logger: wandb
-  save_per_updates: 50000
-  last_per_steps: 5000
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
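The `save_dir` (and `hydra.run.dir`) values are built by OmegaConf interpolation, so the checkpoint path is derived from the other config fields at launch time. Below is a standalone sketch of how the `${...}` references resolve; the `${now:...}` parts are a Hydra resolver filled in when the job starts, so they are left out here.

```python
# Sketch of OmegaConf interpolation for the save_dir pattern used above.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "datasets": {"name": "Emilia_ZH_EN"},
    "model": {
        "name": "F5TTS_Small",
        "tokenizer": "pinyin",
        "mel_spec": {"mel_spec_type": "vocos"},
    },
    "ckpts": {
        "save_dir": "ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}",
    },
})

print(cfg.ckpts.save_dir)  # ckpts/F5TTS_Small_vocos_pinyin_Emilia_ZH_EN
```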
src/f5_tts/train/README.md
CHANGED
@@ -2,9 +2,9 @@
 
 ## Prepare Dataset
 
-Example data processing scripts
+Example data processing scripts, and you may tailor your own one along with a Dataset class in `src/f5_tts/model/dataset.py`.
 
-### 1.
+### 1. Some specific Datasets preparing scripts
 Download corresponding dataset first, and fill in the path in scripts.
 
 ```bash
@@ -38,7 +38,9 @@ Once your datasets are prepared, you can start the training process.
 # setup accelerate config, e.g. use multi-gpu ddp, fp16
 # will be to: ~/.cache/huggingface/accelerate/default_config.yaml
 accelerate config
-
+
+# .yaml files are under src/f5_tts/configs directory
+accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml
 ```
 
 ### 2. Finetuning practice
src/f5_tts/train/train.py
CHANGED
@@ -1,4 +1,5 @@
 # training script.
+
 import os
 from importlib.resources import files
 
@@ -8,7 +9,7 @@ from f5_tts.model import CFM, DiT, Trainer, UNetT
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
-os.chdir(str(files("f5_tts").joinpath("../..")))
+os.chdir(str(files("f5_tts").joinpath("../..")))  # change working directory to root of project (local editable)
 
 
 @hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
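For context on the train.py tweaks: the `os.chdir` call that gains a comment here, together with the `@hydra.main` decorator, is what lets `accelerate launch ... --config-name <file>` pick one of the YAML configs shown earlier. A stripped-down sketch of that entrypoint pattern (the function body is a placeholder, not the repo's actual training logic):

```python
# Entrypoint sketch: resolve the packaged config directory, chdir to the repo
# root (useful for a local editable install), and let Hydra inject the YAML
# selected by --config-name as `cfg`.
import os
from importlib.resources import files

import hydra
from omegaconf import DictConfig, OmegaConf

os.chdir(str(files("f5_tts").joinpath("../..")))  # repo root, as in the diff above


@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
def main(cfg: DictConfig):
    # Placeholder body: train.py builds the tokenizer, dataset, CFM model and
    # Trainer from cfg here.
    print(OmegaConf.to_yaml(cfg.model))


if __name__ == "__main__":
    main()  # e.g. python train.py --config-name F5TTS_Base_train.yaml
```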