mrfakename committed
Commit: ea89faa
Parent: b5979c9

Sync from GitHub repo

This Space is synced from the GitHub repo https://github.com/SWivid/F5-TTS; please submit contributions to the Space through that repo.

pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "0.1.2"
+version = "0.2.0"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
@@ -21,6 +21,7 @@ dependencies = [
     "datasets",
     "ema_pytorch>=0.5.2",
     "gradio>=3.45.2",
+    "hydra-core>=1.3.0",
     "jieba",
     "librosa",
     "matplotlib",
@@ -39,7 +40,6 @@ dependencies = [
     "vocos",
     "wandb",
     "x_transformers>=1.31.14",
-    "hydra-core>=1.3.0",
 ]
 
 [project.optional-dependencies]
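The only functional changes here are the version bump to 0.2.0 and moving `hydra-core>=1.3.0` into its alphabetical slot in the dependency list. To confirm which build is present locally, the package metadata can be queried directly; a minimal sketch, assuming the project is installed under its declared name `f5-tts` (e.g. via `pip install -e .`):

```python
# Minimal sketch: report the installed f5-tts version (assumes an existing
# install of the package under the name declared in pyproject.toml).
from importlib.metadata import PackageNotFoundError, version

try:
    print(version("f5-tts"))  # expected to print 0.2.0 after this commit
except PackageNotFoundError:
    print("f5-tts is not installed in this environment")
```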
src/f5_tts/configs/E2TTS_Base_train.yaml CHANGED
@@ -3,41 +3,41 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN # dataset name
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
-  epochs: 15 # max epochs
-  learning_rate: 7.5e-5 # learning rate
+  epochs: 15
+  learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0 # gradient clipping
-  bnb_optimizer: False # use bnb optimizer or not
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
-  name: E2TTS_Base # model name
-  tokenizer: pinyin # tokenizer type
+  name: E2TTS_Base
+  tokenizer: pinyin
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024 # model dimension
-    depth: 24 # number of transformer layers
-    heads: 16 # number of transformer heads
-    ff_mult: 4 # ff layer expansion
+    dim: 1024
+    depth: 24
+    heads: 16
+    ff_mult: 4
   mel_spec:
-    target_sample_rate: 24000 # target sample rate
-    n_mel_channels: 100 # mel channel
-    hop_length: 256 # hop length
-    win_length: 1024 # window length
-    n_fft: 1024 # fft length
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False # use local vocoder or not
-  local_vocoder_path: None # path to local vocoder
+  is_local_vocoder: False # use local offline vocoder ckpt or not
+  local_vocoder_path: None # path to local vocoder
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
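These training configs are plain Hydra/OmegaConf YAML, so the fields touched above can be inspected or overridden without launching a run. A minimal sketch using Hydra's compose API, assuming `f5_tts` is installed so the packaged `configs` directory resolves the same way `train.py` resolves it; the same pattern applies to the other three config files below:

```python
# Minimal sketch: load the packaged training config with Hydra's compose API
# and read a few of the fields touched in this commit. Assumes f5_tts is
# installed; needs hydra-core>=1.3.0 (now listed in pyproject.toml).
from importlib.resources import files

from hydra import compose, initialize_config_dir

config_dir = str(files("f5_tts").joinpath("configs"))

with initialize_config_dir(version_base="1.3", config_dir=config_dir):
    cfg = compose(
        config_name="E2TTS_Base_train.yaml",
        overrides=["optim.epochs=1"],  # the same key=value syntax works on the CLI
    )

print(cfg.datasets.name)         # Emilia_ZH_EN
print(cfg.optim.learning_rate)   # 7.5e-5
print(cfg.optim.epochs)          # 1 (overridden)
print(cfg.ckpts.last_per_steps)  # 5000
```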
src/f5_tts/configs/E2TTS_Small_train.yaml CHANGED
@@ -5,9 +5,9 @@ hydra:
 datasets:
   name: Emilia_ZH_EN
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
   epochs: 15
@@ -37,7 +37,7 @@ model:
   local_vocoder_path: None
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
src/f5_tts/configs/F5TTS_Base_train.yaml CHANGED
@@ -3,43 +3,43 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN # dataset name
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
-  epochs: 15 # max epochs
-  learning_rate: 7.5e-5 # learning rate
+  epochs: 15
+  learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0 # gradient clipping
-  bnb_optimizer: False # use bnb optimizer or not
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
-  name: F5TTS_Base # model name
-  tokenizer: pinyin # tokenizer type
+  name: F5TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024 # model dim
-    depth: 22 # model depth
-    heads: 16 # model heads
-    ff_mult: 2 # feedforward expansion
-    text_dim: 512 # text encoder dim
-    conv_layers: 4 # convolution layers
+    dim: 1024
+    depth: 22
+    heads: 16
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
   mel_spec:
-    target_sample_rate: 24000 # target sample rate
-    n_mel_channels: 100 # mel channel
-    hop_length: 256 # hop length
-    win_length: 1024 # window length
-    n_fft: 1024 # fft length
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False # use local vocoder or not
-  local_vocoder_path: None # local vocoder path
+  is_local_vocoder: False # use local offline vocoder ckpt or not
+  local_vocoder_path: None # local vocoder path
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
src/f5_tts/configs/F5TTS_Small_train.yaml CHANGED
@@ -5,17 +5,17 @@ hydra:
 datasets:
   name: Emilia_ZH_EN
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
   epochs: 15
   learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
-  bnb_optimizer: False
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
   name: F5TTS_Small
@@ -39,7 +39,7 @@ model:
   local_vocoder_path: None
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
src/f5_tts/train/README.md CHANGED
@@ -2,9 +2,9 @@
 
 ## Prepare Dataset
 
-Example data processing scripts for Emilia and Wenetspeech4TTS, and you may tailor your own one along with a Dataset class in `src/f5_tts/model/dataset.py`.
+Example data processing scripts, and you may tailor your own one along with a Dataset class in `src/f5_tts/model/dataset.py`.
 
-### 1. Datasets used for pretrained models
+### 1. Some specific Datasets preparing scripts
 Download corresponding dataset first, and fill in the path in scripts.
 
 ```bash
@@ -38,7 +38,9 @@ Once your datasets are prepared, you can start the training process.
 # setup accelerate config, e.g. use multi-gpu ddp, fp16
 # will be to: ~/.cache/huggingface/accelerate/default_config.yaml
 accelerate config
-accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml # F5TTS_Base_train.yaml | E2TTS_Base_train.yaml
+
+# .yaml files are under src/f5_tts/configs directory
+accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml
 ```
 
 ### 2. Finetuning practice
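The rewritten comment points at `src/f5_tts/configs` as the home of the `--config-name` choices. For an installed package the same set can be listed programmatically; a small sketch, with the printed list shown only as the expected example output:

```python
# Small sketch: list the packaged training configs that --config-name accepts
# (assumes f5_tts is installed; output below is the expected example).
from importlib.resources import files

config_dir = files("f5_tts").joinpath("configs")
print(sorted(entry.name for entry in config_dir.iterdir() if entry.name.endswith(".yaml")))
# ['E2TTS_Base_train.yaml', 'E2TTS_Small_train.yaml',
#  'F5TTS_Base_train.yaml', 'F5TTS_Small_train.yaml']
```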
src/f5_tts/train/train.py CHANGED
@@ -1,4 +1,5 @@
 # training script.
+
 import os
 from importlib.resources import files
 
@@ -8,7 +9,7 @@ from f5_tts.model import CFM, DiT, Trainer, UNetT
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
-os.chdir(str(files("f5_tts").joinpath("../..")))
+os.chdir(str(files("f5_tts").joinpath("../..")))  # change working directory to root of project (local editable)
 
 
 @hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
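For context on why the annotated `os.chdir` matters: `train.py` is a Hydra entry point, so the config name given on the CLI is resolved against the packaged `configs` directory, while relative paths such as `ckpts/...` are resolved against the working directory, which this line forces to the project root for local editable installs. An illustrative sketch of that entry-point shape (not the actual F5-TTS training code):

```python
# Illustrative sketch of the entry-point pattern used by train.py (the real
# script builds CFM/DiT/UNetT models and a Trainer; here we only print the
# composed config to show how --config-name and os.chdir interact).
import os
from importlib.resources import files

import hydra
from omegaconf import DictConfig, OmegaConf

os.chdir(str(files("f5_tts").joinpath("../..")))  # project root for a local editable install


@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
def main(cfg: DictConfig):
    # e.g. launched with: --config-name F5TTS_Base_train.yaml
    print(OmegaConf.to_yaml(cfg))


if __name__ == "__main__":
    main()
```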