Merge branch 'mrfakename/E2-F5-TTS' into 'VX3/MimicYouFree'
Files changed:
- README_REPO.md +3 -3
- app.py +115 -78
- pyproject.toml +2 -1
- src/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
- src/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
- src/f5_tts/configs/F5TTS_Base_train.yaml +47 -0
- src/f5_tts/configs/F5TTS_Small_train.yaml +47 -0
- src/f5_tts/eval/README.md +9 -6
- src/f5_tts/eval/eval_infer_batch.py +2 -2
- src/f5_tts/eval/eval_librispeech_test_clean.py +77 -54
- src/f5_tts/eval/eval_seedtts_testset.py +76 -56
- src/f5_tts/eval/eval_utmos.py +44 -0
- src/f5_tts/eval/utils_eval.py +16 -8
- src/f5_tts/infer/README.md +5 -0
- src/f5_tts/infer/SHARED.md +93 -21
- src/f5_tts/infer/examples/basic/basic.toml +1 -1
- src/f5_tts/infer/examples/multi/story.toml +1 -0
- src/f5_tts/infer/infer_cli.py +181 -51
- src/f5_tts/infer/utils_infer.py +12 -3
- src/f5_tts/model/backbones/dit.py +15 -1
- src/f5_tts/model/trainer.py +1 -1
- src/f5_tts/model/utils.py +22 -16
- src/f5_tts/train/README.md +13 -3
- src/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
- src/f5_tts/train/train.py +39 -67
README_REPO.md CHANGED
@@ -147,11 +147,11 @@ Note: Some model components have linting exceptions for E722 to accommodate tensor operations
 ## Acknowledgements

 - [E2-TTS](https://arxiv.org/abs/2406.18009) brilliant work, simple and effective
-- [Emilia](https://arxiv.org/abs/2407.05361), [WenetSpeech4TTS](https://arxiv.org/abs/2406.05763) valuable datasets
+- [Emilia](https://arxiv.org/abs/2407.05361), [WenetSpeech4TTS](https://arxiv.org/abs/2406.05763), [LibriTTS](https://arxiv.org/abs/1904.02882), [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) valuable datasets
 - [lucidrains](https://github.com/lucidrains) initial CFM structure with also [bfs18](https://github.com/bfs18) for discussion
 - [SD3](https://arxiv.org/abs/2403.03206) & [Hugging Face diffusers](https://github.com/huggingface/diffusers) DiT and MMDiT code structure
-- [torchdiffeq](https://github.com/rtqichen/torchdiffeq) as ODE solver, [Vocos](https://huggingface.co/charactr/vocos-mel-24khz) as vocoder
+- [torchdiffeq](https://github.com/rtqichen/torchdiffeq) as ODE solver, [Vocos](https://huggingface.co/charactr/vocos-mel-24khz) and [BigVGAN](https://github.com/NVIDIA/BigVGAN) as vocoder
-- [FunASR](https://github.com/modelscope/FunASR), [faster-whisper](https://github.com/SYSTRAN/faster-whisper), [UniSpeech](https://github.com/microsoft/UniSpeech) for evaluation tools
+- [FunASR](https://github.com/modelscope/FunASR), [faster-whisper](https://github.com/SYSTRAN/faster-whisper), [UniSpeech](https://github.com/microsoft/UniSpeech), [SpeechMOS](https://github.com/tarepan/SpeechMOS) for evaluation tools
 - [ctc-forced-aligner](https://github.com/MahmoudAshraf97/ctc-forced-aligner) for speech edit test
 - [mrfakename](https://x.com/realmrfakename) huggingface space demo ~
 - [f5-tts-mlx](https://github.com/lucasnewman/f5-tts-mlx/tree/main) Implementation with MLX framework by [Lucas Newman](https://github.com/lucasnewman)
app.py CHANGED
@@ -1,6 +1,7 @@
 # ruff: noqa: E402
 # Above allows ruff to ignore E402: module level import not at top of file

+import json
 import re
 import tempfile
 from collections import OrderedDict
@@ -43,6 +44,12 @@ from f5_tts.infer.utils_infer import (
 DEFAULT_TTS_MODEL = "F5-TTS"
 tts_model_choice = DEFAULT_TTS_MODEL

+DEFAULT_TTS_MODEL_CFG = [
+    "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
+    "hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt",
+    json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
+]
+

 # load models
@@ -103,8 +110,24 @@ def generate_response(messages, model, tokenizer):
 @gpu_decorator
 def infer(
+    ref_audio_orig,
+    ref_text,
+    gen_text,
+    model,
+    remove_silence,
+    cross_fade_duration=0.15,
+    nfe_step=32,
+    speed=1,
+    show_info=gr.Info,
 ):
+    if not ref_audio_orig:
+        gr.Warning("Please provide reference audio.")
+        return gr.update(), gr.update(), ref_text
+
+    if not gen_text.strip():
+        gr.Warning("Please enter text to generate.")
+        return gr.update(), gr.update(), ref_text
+
     ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

     if model == "F5-TTS":
@@ -120,7 +143,7 @@ def infer(
         global custom_ema_model, pre_custom_path
         if pre_custom_path != model[1]:
             show_info("Loading Custom TTS model...")
-            custom_ema_model = load_custom(model[1], vocab_path=model[2])
+            custom_ema_model = load_custom(model[1], vocab_path=model[2], model_cfg=model[3])
             pre_custom_path = model[1]
         ema_model = custom_ema_model

@@ -131,6 +154,7 @@ def infer(
         ema_model,
         vocoder,
         cross_fade_duration=cross_fade_duration,
+        nfe_step=nfe_step,
         speed=speed,
         show_info=show_info,
         progress=gr.Progress(),
@@ -184,6 +208,14 @@ with gr.Blocks() as app_tts:
             step=0.1,
             info="Adjust the speed of the audio.",
         )
+        nfe_slider = gr.Slider(
+            label="NFE Steps",
+            minimum=4,
+            maximum=64,
+            value=32,
+            step=2,
+            info="Set the number of denoising steps.",
+        )
         cross_fade_duration_slider = gr.Slider(
             label="Cross-Fade Duration (s)",
             minimum=0.0,
@@ -203,6 +235,7 @@ with gr.Blocks() as app_tts:
         gen_text_input,
         remove_silence,
         cross_fade_duration_slider,
+        nfe_slider,
         speed_slider,
     ):
         audio_out, spectrogram_path, ref_text_out = infer(
@@ -211,10 +244,11 @@ with gr.Blocks() as app_tts:
             gen_text_input,
             tts_model_choice,
             remove_silence,
+            cross_fade_duration=cross_fade_duration_slider,
+            nfe_step=nfe_slider,
+            speed=speed_slider,
         )
+        return audio_out, spectrogram_path, ref_text_out

     generate_btn.click(
         basic_tts,
@@ -224,6 +258,7 @@ with gr.Blocks() as app_tts:
             gen_text_input,
             remove_silence,
             cross_fade_duration_slider,
+            nfe_slider,
             speed_slider,
         ],
         outputs=[audio_output, spectrogram_output, ref_text_input],
@@ -293,7 +328,7 @@ with gr.Blocks() as app_multistyle:
     )

     # Regular speech type (mandatory)
-    with gr.Row():
+    with gr.Row() as regular_row:
         with gr.Column():
             regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
             regular_insert = gr.Button("Insert Label", variant="secondary")
@@ -302,12 +337,12 @@ with gr.Blocks() as app_multistyle:

     # Regular speech type (max 100)
     max_speech_types = 100
-    speech_type_rows = []
-    speech_type_names = [regular_name]
-    speech_type_audios = [regular_audio]
-    speech_type_ref_texts = [regular_ref_text]
-    speech_type_delete_btns = []
-    speech_type_insert_btns = [regular_insert]
+    speech_type_rows = [regular_row]
+    speech_type_names = [regular_name]
+    speech_type_audios = [regular_audio]
+    speech_type_ref_texts = [regular_ref_text]
+    speech_type_delete_btns = [None]
+    speech_type_insert_btns = [regular_insert]

     # Additional speech types (99 more)
     for i in range(max_speech_types - 1):
@@ -328,51 +363,32 @@ with gr.Blocks() as app_multistyle:
     # Button to add speech type
     add_speech_type_btn = gr.Button("Add Speech Type")

+    # Keep track of autoincrement of speech types, no roll back
+    speech_type_count = 1

     # Function to add a speech type
+    def add_speech_type_fn():
+        row_updates = [gr.update() for _ in range(max_speech_types)]
+        global speech_type_count
         if speech_type_count < max_speech_types:
+            row_updates[speech_type_count] = gr.update(visible=True)
             speech_type_count += 1
         else:
+            gr.Warning("Exhausted maximum number of speech types. Consider restart the app.")
+        return row_updates

+    add_speech_type_btn.click(add_speech_type_fn, outputs=speech_type_rows)

     # Function to delete a speech type
+    def delete_speech_type_fn():
+        return gr.update(visible=False), None, None, None

     # Update delete button clicks
+    for i in range(1, len(speech_type_delete_btns)):
+        speech_type_delete_btns[i].click(
+            delete_speech_type_fn,
+            outputs=[speech_type_rows[i], speech_type_names[i], speech_type_audios[i], speech_type_ref_texts[i]],
+        )

     # Text input for the prompt
     gen_text_input_multistyle = gr.Textbox(
@@ -386,7 +402,7 @@ with gr.Blocks() as app_multistyle:
         current_text = current_text or ""
         speech_type_name = speech_type_name or "None"
         updated_text = current_text + f"{{{speech_type_name}}} "
+        return updated_text

     return insert_speech_type_fn
@@ -446,10 +462,14 @@ with gr.Blocks() as app_multistyle:
             if style in speech_types:
                 current_style = style
             else:
+                gr.Warning(f"Type {style} is not available, will use Regular as default.")
                 current_style = "Regular"

+            try:
+                ref_audio = speech_types[current_style]["audio"]
+            except KeyError:
+                gr.Warning(f"Please provide reference audio for type {current_style}.")
+                return [None] + [speech_types[style]["ref_text"] for style in speech_types]
             ref_text = speech_types[current_style].get("ref_text", "")

             # Generate speech for this segment
@@ -464,12 +484,10 @@ with gr.Blocks() as app_multistyle:
         # Concatenate all audio segments
         if generated_audio_segments:
             final_audio_data = np.concatenate(generated_audio_segments)
-            return [(sr, final_audio_data)] + [
-                gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
-            ]
+            return [(sr, final_audio_data)] + [speech_types[style]["ref_text"] for style in speech_types]
         else:
             gr.Warning("No audio generated.")
+            return [None] + [speech_types[style]["ref_text"] for style in speech_types]

     generate_multistyle_btn.click(
         generate_multistyle_speech,
@@ -487,7 +505,7 @@ with gr.Blocks() as app_multistyle:

     # Validation function to disable Generate button if speech types are missing
     def validate_speech_types(gen_text, regular_name, *args):
+        speech_type_names_list = args

         # Collect the speech types names
         speech_types_available = set()
@@ -651,7 +669,7 @@ Have a conversation with an AI using your reference voice!
             speed=1.0,
             show_info=print,  # show_info=print no pull to top when generating
         )
+        return audio_result, ref_text_out

     def clear_conversation():
         """Reset the conversation"""
@@ -744,34 +762,38 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     """
     )

+    last_used_custom = files("f5_tts").joinpath("infer/.cache/last_used_custom_model_info.txt")

     def load_last_used_custom():
         try:
+            custom = []
+            with open(last_used_custom, "r", encoding="utf-8") as f:
+                for line in f:
+                    custom.append(line.strip())
+            return custom
         except FileNotFoundError:
             last_used_custom.parent.mkdir(parents=True, exist_ok=True)
+            return DEFAULT_TTS_MODEL_CFG

     def switch_tts_model(new_choice):
         global tts_model_choice
         if new_choice == "Custom":  # override in case webpage is refreshed
+            custom_ckpt_path, custom_vocab_path, custom_model_cfg = load_last_used_custom()
+            tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path, json.loads(custom_model_cfg)]
+            return (
+                gr.update(visible=True, value=custom_ckpt_path),
+                gr.update(visible=True, value=custom_vocab_path),
+                gr.update(visible=True, value=custom_model_cfg),
+            )
         else:
             tts_model_choice = new_choice
+            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

+    def set_custom_model(custom_ckpt_path, custom_vocab_path, custom_model_cfg):
         global tts_model_choice
+        tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path, json.loads(custom_model_cfg)]
+        with open(last_used_custom, "w", encoding="utf-8") as f:
+            f.write(custom_ckpt_path + "\n" + custom_vocab_path + "\n" + custom_model_cfg + "\n")

     with gr.Row():
         if not USING_SPACES:
@@ -783,34 +805,49 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
         choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
     )
     custom_ckpt_path = gr.Dropdown(
+        choices=[DEFAULT_TTS_MODEL_CFG[0]],
         value=load_last_used_custom()[0],
         allow_custom_value=True,
+        label="Model: local_path | hf://user_id/repo_id/model_ckpt",
         visible=False,
     )
     custom_vocab_path = gr.Dropdown(
+        choices=[DEFAULT_TTS_MODEL_CFG[1]],
         value=load_last_used_custom()[1],
         allow_custom_value=True,
+        label="Vocab: local_path | hf://user_id/repo_id/vocab_file",
+        visible=False,
+    )
+    custom_model_cfg = gr.Dropdown(
+        choices=[
+            DEFAULT_TTS_MODEL_CFG[2],
+            json.dumps(dict(dim=768, depth=18, heads=12, ff_mult=2, text_dim=512, conv_layers=4)),
+        ],
+        value=load_last_used_custom()[2],
+        allow_custom_value=True,
+        label="Config: in a dictionary form",
         visible=False,
     )

     choose_tts_model.change(
         switch_tts_model,
         inputs=[choose_tts_model],
+        outputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
         show_progress="hidden",
     )
     custom_ckpt_path.change(
         set_custom_model,
+        inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
         show_progress="hidden",
     )
     custom_vocab_path.change(
         set_custom_model,
+        inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
+        show_progress="hidden",
+    )
+    custom_model_cfg.change(
+        set_custom_model,
+        inputs=[custom_ckpt_path, custom_vocab_path, custom_model_cfg],
         show_progress="hidden",
     )
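The custom-model plumbing added above stores the model architecture as a JSON string (the third entry of `DEFAULT_TTS_MODEL_CFG`), persists all three entries to `infer/.cache/last_used_custom_model_info.txt`, and parses the config back with `json.loads` before handing it to `load_custom`. Below is a minimal, Gradio-free sketch of that round-trip; the helper names are hypothetical, and only the file layout and the config dict mirror the diff.

```python
import json
from pathlib import Path

# Mirrors DEFAULT_TTS_MODEL_CFG from the diff: ckpt path, vocab path, arch config as a JSON string.
DEFAULT_TTS_MODEL_CFG = [
    "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
    "hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt",
    json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
]

# Same cache layout the app writes to (relative to the f5_tts package in the real code).
cache_file = Path("infer/.cache/last_used_custom_model_info.txt")


def save_custom(ckpt_path: str, vocab_path: str, model_cfg: str) -> None:
    # One value per line, exactly as set_custom_model() writes them.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    cache_file.write_text(f"{ckpt_path}\n{vocab_path}\n{model_cfg}\n", encoding="utf-8")


def load_custom_info() -> list[str]:
    # load_last_used_custom() falls back to the default triplet when the cache is missing.
    try:
        return [line.strip() for line in cache_file.read_text(encoding="utf-8").splitlines()]
    except FileNotFoundError:
        return DEFAULT_TTS_MODEL_CFG


save_custom(*DEFAULT_TTS_MODEL_CFG)
ckpt, vocab, cfg_json = load_custom_info()
model_cfg = json.loads(cfg_json)  # this dict is what infer() forwards as model_cfg=model[3]
print(model_cfg["dim"], model_cfg["depth"])  # 1024 22
```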
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "f5-tts"
+version = "0.3.1"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
@@ -21,6 +21,7 @@ dependencies = [
     "datasets",
     "ema_pytorch>=0.5.2",
     "gradio>=3.45.2",
+    "hydra-core>=1.3.0",
     "jieba",
     "librosa",
     "matplotlib",
src/f5_tts/configs/E2TTS_Base_train.yaml ADDED
@@ -0,0 +1,44 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+
+model:
+  name: E2TTS_Base
+  tokenizer: pinyin
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024
+    depth: 24
+    heads: 16
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: None  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
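With `hydra-core` now in the dependencies, configs like the one above are intended to be resolved by Hydra at training time (the `hydra.run.dir` key controls the working directory). A rough sketch of a Hydra entry point consuming this file follows; the `config_path`/`config_name` values and field accesses are assumptions drawn from this YAML, not the actual `src/f5_tts/train/train.py`.

```python
import hydra
from omegaconf import DictConfig, OmegaConf


# Assumed entry point: config_path/config_name point at the YAML shown above.
@hydra.main(version_base="1.3", config_path="src/f5_tts/configs", config_name="E2TTS_Base_train")
def main(cfg: DictConfig) -> None:
    print(OmegaConf.to_yaml(cfg, resolve=True))                      # full resolved config
    print(cfg.model.name, cfg.model.arch.dim, cfg.model.arch.depth)  # E2TTS_Base 1024 24
    print(cfg.optim.learning_rate)                                   # 7.5e-05
    print(cfg.ckpts.save_dir)  # ckpts/E2TTS_Base_vocos_pinyin_Emilia_ZH_EN


if __name__ == "__main__":
    main()
```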
src/f5_tts/configs/E2TTS_Small_train.yaml ADDED
@@ -0,0 +1,44 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+  bnb_optimizer: False
+
+model:
+  name: E2TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 20
+    heads: 12
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: None  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
src/f5_tts/configs/F5TTS_Base_train.yaml ADDED
@@ -0,0 +1,47 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+
+model:
+  name: F5TTS_Base  # model name
+  tokenizer: pinyin  # tokenizer type
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024
+    depth: 22
+    heads: 16
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+    checkpoint_activations: False  # recompute activations and save memory for extra compute
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: None  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
src/f5_tts/configs/F5TTS_Small_train.yaml ADDED
@@ -0,0 +1,47 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+
+model:
+  name: F5TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 18
+    heads: 12
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+    checkpoint_activations: False  # recompute activations and save memory for extra compute
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: None  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
src/f5_tts/eval/README.md CHANGED
@@ -39,11 +39,14 @@

 ### Objective Evaluation

-Update the path with your batch-inferenced results, and carry out WER / SIM evaluations:
+Update the path with your batch-inferenced results, and carry out WER / SIM / UTMOS evaluations:
 ```bash
-# Evaluation for Seed-TTS test set
-python src/f5_tts/eval/eval_seedtts_testset.py
+# Evaluation [WER] for Seed-TTS test [ZH] set
+python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAV_DIR> --gpu_nums 8

-# Evaluation for LibriSpeech-PC test-clean (cross-sentence)
-python src/f5_tts/eval/eval_librispeech_test_clean.py
+# Evaluation [SIM] for LibriSpeech-PC test-clean (cross-sentence)
+python src/f5_tts/eval/eval_librispeech_test_clean.py --eval_task sim --gen_wav_dir <GEN_WAV_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
+
+# Evaluation [UTMOS]. --ext: Audio extension
+python src/f5_tts/eval/eval_utmos.py --audio_dir <WAV_DIR> --ext wav
+```
src/f5_tts/eval/eval_infer_batch.py CHANGED
@@ -34,8 +34,6 @@ win_length = 1024
 n_fft = 1024
 target_rms = 0.1

-
-tokenizer = "pinyin"
 rel_path = str(files("f5_tts").joinpath("../../"))

@@ -49,6 +47,7 @@ def main():
     parser.add_argument("-n", "--expname", required=True)
     parser.add_argument("-c", "--ckptstep", default=1200000, type=int)
     parser.add_argument("-m", "--mel_spec_type", default="vocos", type=str, choices=["bigvgan", "vocos"])
+    parser.add_argument("-to", "--tokenizer", default="pinyin", type=str, choices=["pinyin", "char"])

     parser.add_argument("-nfe", "--nfestep", default=32, type=int)
     parser.add_argument("-o", "--odemethod", default="euler")
@@ -64,6 +63,7 @@ def main():
     ckpt_step = args.ckptstep
     ckpt_path = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}.pt"
     mel_spec_type = args.mel_spec_type
+    tokenizer = args.tokenizer

     nfe_step = args.nfestep
     ode_method = args.odemethod
src/f5_tts/eval/eval_librispeech_test_clean.py CHANGED
@@ -1,7 +1,9 @@
 # Evaluate with Librispeech test-clean, ~3s prompt to generate 4-10s audio (the way of valle/voicebox evaluation)

+import argparse
+import json
 import os
+import sys

 sys.path.append(os.getcwd())

@@ -9,7 +11,6 @@ import multiprocessing as mp
 from importlib.resources import files

 import numpy as np
-
 from f5_tts.eval.utils_eval import (
     get_librispeech_test,
     run_asr_wer,
@@ -19,55 +20,77 @@ from f5_tts.eval.utils_eval import (
 rel_path = str(files("f5_tts").joinpath("../../"))


+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
+    parser.add_argument("-l", "--lang", type=str, default="en")
+    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+    parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
+    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
+    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    eval_task = args.eval_task
+    lang = args.lang
+    librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
+    gen_wav_dir = args.gen_wav_dir
+    metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
+
+    gpus = list(range(args.gpu_nums))
+    test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
+
+    ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
+    ## leading to a low similarity for the ground truth in some cases.
+    # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth
+
+    local = args.local
+    if local:  # use local custom checkpoint dir
+        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+    else:
+        asr_ckpt_dir = ""  # auto download to cache dir
+    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+    # --------------------------- WER ---------------------------
+
+    if eval_task == "wer":
+        wer_results = []
+        wers = []
+
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_asr_wer, args)
+            for r in results:
+                wer_results.extend(r)
+
+        wer_result_path = f"{gen_wav_dir}/{lang}_wer_results.jsonl"
+        with open(wer_result_path, "w") as f:
+            for line in wer_results:
+                wers.append(line["wer"])
+                json_line = json.dumps(line, ensure_ascii=False)
+                f.write(json_line + "\n")
+
+        wer = round(np.mean(wers) * 100, 3)
+        print(f"\nTotal {len(wers)} samples")
+        print(f"WER : {wer}%")
+        print(f"Results have been saved to {wer_result_path}")
+
+    # --------------------------- SIM ---------------------------
+
+    if eval_task == "sim":
+        sims = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_sim, args)
+            for r in results:
+                sims.extend(r)
+
+        sim = round(sum(sims) / len(sims), 3)
+        print(f"\nTotal {len(sims)} samples")
+        print(f"SIM : {sim}")
+
+
+if __name__ == "__main__":
+    main()
src/f5_tts/eval/eval_seedtts_testset.py CHANGED
@@ -1,7 +1,9 @@
 # Evaluate with Seed-TTS testset

+import argparse
+import json
 import os
+import sys

 sys.path.append(os.getcwd())

@@ -9,7 +11,6 @@ import multiprocessing as mp
 from importlib.resources import files

 import numpy as np
-
 from f5_tts.eval.utils_eval import (
     get_seed_tts_test,
     run_asr_wer,
@@ -19,57 +20,76 @@ from f5_tts.eval.utils_eval import (
 rel_path = str(files("f5_tts").joinpath("../../"))


+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
+    parser.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
+    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
+    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    eval_task = args.eval_task
+    lang = args.lang
+    gen_wav_dir = args.gen_wav_dir
+    metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
+
+    # NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
+    # zh 1.254 seems a result of 4 workers wer_seed_tts
+    gpus = list(range(args.gpu_nums))
+    test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)
+
+    local = args.local
+    if local:  # use local custom checkpoint dir
+        if lang == "zh":
+            asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
+        elif lang == "en":
+            asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+    else:
+        asr_ckpt_dir = ""  # auto download to cache dir
+    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+    # --------------------------- WER ---------------------------
+
+    if eval_task == "wer":
+        wer_results = []
+        wers = []
+
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_asr_wer, args)
+            for r in results:
+                wer_results.extend(r)
+
+        wer_result_path = f"{gen_wav_dir}/{lang}_wer_results.jsonl"
+        with open(wer_result_path, "w") as f:
+            for line in wer_results:
+                wers.append(line["wer"])
+                json_line = json.dumps(line, ensure_ascii=False)
+                f.write(json_line + "\n")
+
+        wer = round(np.mean(wers) * 100, 3)
+        print(f"\nTotal {len(wers)} samples")
+        print(f"WER : {wer}%")
+        print(f"Results have been saved to {wer_result_path}")
+
+    # --------------------------- SIM ---------------------------
+
+    if eval_task == "sim":
+        sims = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_sim, args)
+            for r in results:
+                sims.extend(r)
+
+        sim = round(sum(sims) / len(sims), 3)
+        print(f"\nTotal {len(sims)} samples")
+        print(f"SIM : {sim}")
+
+
+if __name__ == "__main__":
+    main()
src/f5_tts/eval/eval_utmos.py ADDED
@@ -0,0 +1,44 @@
+import argparse
+import json
+from pathlib import Path
+
+import librosa
+import torch
+from tqdm import tqdm
+
+
+def main():
+    parser = argparse.ArgumentParser(description="UTMOS Evaluation")
+    parser.add_argument("--audio_dir", type=str, required=True, help="Audio file path.")
+    parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
+    args = parser.parse_args()
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
+    predictor = predictor.to(device)
+
+    audio_paths = list(Path(args.audio_dir).rglob(f"*.{args.ext}"))
+    utmos_results = {}
+    utmos_score = 0
+
+    for audio_path in tqdm(audio_paths, desc="Processing"):
+        wav_name = audio_path.stem
+        wav, sr = librosa.load(audio_path, sr=None, mono=True)
+        wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)
+        score = predictor(wav_tensor, sr)
+        utmos_results[str(wav_name)] = score.item()
+        utmos_score += score.item()
+
+    avg_score = utmos_score / len(audio_paths) if len(audio_paths) > 0 else 0
+    print(f"UTMOS: {avg_score}")
+
+    utmos_result_path = Path(args.audio_dir) / "utmos_results.json"
+    with open(utmos_result_path, "w", encoding="utf-8") as f:
+        json.dump(utmos_results, f, ensure_ascii=False, indent=4)
+
+    print(f"Results have been saved to {utmos_result_path}")
+
+
+if __name__ == "__main__":
+    main()
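`eval_utmos.py` scores a whole directory; for a quick sanity check on a single clip, the same SpeechMOS predictor can be called directly. The snippet below reuses the exact `torch.hub` entry point and call pattern from the script above; the audio file name is a placeholder.

```python
import librosa
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same hub entry point as in eval_utmos.py above.
predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True).to(device)

wav, sr = librosa.load("sample.wav", sr=None, mono=True)  # placeholder path
score = predictor(torch.from_numpy(wav).to(device).unsqueeze(0), sr)
print(f"UTMOS: {score.item():.3f}")  # predicted MOS, roughly on a 1-5 scale
```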
src/f5_tts/eval/utils_eval.py CHANGED
@@ -2,6 +2,7 @@ import math
 import os
 import random
 import string
+from pathlib import Path

 import torch
 import torch.nn.functional as F
@@ -320,7 +321,7 @@ def run_asr_wer(args):
     from zhon.hanzi import punctuation

     punctuation_all = punctuation + string.punctuation
+    wer_results = []

     from jiwer import compute_measures
@@ -335,8 +336,8 @@ def run_asr_wer(args):
         for segment in segments:
             hypo = hypo + " " + segment.text

+        raw_truth = truth
+        raw_hypo = hypo

         for x in punctuation_all:
             truth = truth.replace(x, "")
@@ -360,9 +361,16 @@ def run_asr_wer(args):
         # dele = measures["deletions"] / len(ref_list)
         # inse = measures["insertions"] / len(ref_list)

+        wer_results.append(
+            {
+                "wav": Path(gen_wav).stem,
+                "truth": raw_truth,
+                "hypo": raw_hypo,
+                "wer": wer,
+            }
+        )

+    return wer_results


 # SIM Evaluation
@@ -381,7 +389,7 @@ def run_sim(args):
     model = model.cuda(device)
     model.eval()

+    sims = []
     for wav1, wav2, truth in tqdm(test_set):
         wav1, sr1 = torchaudio.load(wav1)
         wav2, sr2 = torchaudio.load(wav2)
@@ -400,6 +408,6 @@ def run_sim(args):

         sim = F.cosine_similarity(emb1, emb2)[0].item()
         # print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
+        sims.append(sim)

+    return sims
src/f5_tts/infer/README.md CHANGED
@@ -12,6 +12,8 @@ To avoid possible inference failures, make sure you have seen through the following
 - Uppercased letters will be uttered letter by letter, so use lowercased letters for normal words.
 - Add some spaces (blank: " ") or punctuations (e.g. "," ".") to explicitly introduce some pauses.
 - Preprocess numbers to Chinese letters if you want to have them read in Chinese, otherwise in English.
+- If the generation output is blank (pure silence), check for ffmpeg installation (various tutorials online, blogs, videos, etc.).
+- Try turning off use_ema if using an early-stage finetuned checkpoint (one trained for only a few updates).


 ## Gradio App
@@ -62,6 +64,9 @@ f5-tts_infer-cli \
 # Choose Vocoder
 f5-tts_infer-cli --vocoder_name bigvgan --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base_bigvgan/model_1250000.pt>
 f5-tts_infer-cli --vocoder_name vocos --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base/model_1200000.safetensors>
+
+# More instructions
+f5-tts_infer-cli --help
 ```

 And a `.toml` file would help with more flexible usage.
src/f5_tts/infer/SHARED.md CHANGED
@@ -16,59 +16,131 @@
 <!-- omit in toc -->
 ### Supported Languages
 - [Multilingual](#multilingual)
+  - [F5-TTS Base @ zh \& en @ F5-TTS](#f5-tts-base--zh--en--f5-tts)
 - [English](#english)
+- [Finnish](#finnish)
+  - [F5-TTS Base @ fi @ AsmoKoskinen](#f5-tts-base--fi--asmokoskinen)
 - [French](#french)
+  - [F5-TTS Base @ fr @ RASPIAUDIO](#f5-tts-base--fr--raspiaudio)
+- [Hindi](#hindi)
+  - [F5-TTS Small @ hi @ SPRINGLab](#f5-tts-small--hi--springlab)
+- [Italian](#italian)
+  - [F5-TTS Base @ it @ alien79](#f5-tts-base--it--alien79)
+- [Japanese](#japanese)
+  - [F5-TTS Base @ ja @ Jmica](#f5-tts-base--ja--jmica)
+- [Mandarin](#mandarin)
+- [Spanish](#spanish)
+  - [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)


 ## Multilingual

+#### F5-TTS Base @ zh & en @ F5-TTS
 |Model|🤗Hugging Face|Data (Hours)|Model License|
 |:---:|:------------:|:-----------:|:-------------:|
 |F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|

 ```bash
+Model: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
+Vocab: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
+Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
 ```

 *Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*


+## English


+## Finnish
+
+#### F5-TTS Base @ fi @ AsmoKoskinen
+|Model|🤗Hugging Face|Data|Model License|
 |:---:|:------------:|:-----------:|:-------------:|
+|F5-TTS Base|[ckpt & vocab](https://huggingface.co/AsmoKoskinen/F5-TTS_Finnish_Model)|[Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0), [Vox Populi](https://huggingface.co/datasets/facebook/voxpopuli)|cc-by-nc-4.0|

 ```bash
+Model: hf://AsmoKoskinen/F5-TTS_Finnish_Model/model_common_voice_fi_vox_populi_fi_20241206.safetensors
+Vocab: hf://AsmoKoskinen/F5-TTS_Finnish_Model/vocab.txt
+Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
 ```


 ## French

+#### F5-TTS Base @ fr @ RASPIAUDIO
 |Model|🤗Hugging Face|Data (Hours)|Model License|
 |:---:|:------------:|:-----------:|:-------------:|
+|F5-TTS Base|[ckpt & vocab](https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced)|[LibriVox](https://librivox.org/)|cc-by-nc-4.0|

 ```bash
+Model: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.pt
+Vocab: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
+Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
 ```

 - [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
 - [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
 - [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).
+
+
+## Hindi
+
+#### F5-TTS Small @ hi @ SPRINGLab
+|Model|🤗Hugging Face|Data (Hours)|Model License|
+|:---:|:------------:|:-----------:|:-------------:|
+|F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi) |cc-by-4.0|
+
+```bash
+Model: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
+Vocab: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
+Config: {"dim": 768, "depth": 18, "heads": 12, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
+```
+
+- Authors: SPRING Lab, Indian Institute of Technology, Madras
+- Website: https://asr.iitm.ac.in/
+
+
+## Italian
+
+#### F5-TTS Base @ it @ alien79
+|Model|🤗Hugging Face|Data|Model License|
+|:---:|:------------:|:-----------:|:-------------:|
+|F5-TTS Base|[ckpt & vocab](https://huggingface.co/alien79/F5-TTS-italian)|[ylacombe/cml-tts](https://huggingface.co/datasets/ylacombe/cml-tts) |cc-by-nc-4.0|
+
+```bash
+Model: hf://alien79/F5-TTS-italian/model_159600.safetensors
+Vocab: hf://alien79/F5-TTS-italian/vocab.txt
+Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
+```
+
+- Trained by [Mithril Man](https://github.com/MithrilMan)
+- Model details on [hf project home](https://huggingface.co/alien79/F5-TTS-italian)
+- Open to collaborations to further improve the model
+
|
123 |
+
|
124 |
+
#### F5-TTS Base @ ja @ Jmica
|
125 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
126 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
127 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/Jmica/F5TTS/tree/main/JA_8500000)|[Emilia 1.7k JA](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07) & [Galgame Dataset 5.4k](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset)|cc-by-nc-4.0|
|
128 |
+
|
129 |
+
```bash
|
130 |
+
Model: hf://Jmica/F5TTS/JA_8500000/model_8499660.pt
|
131 |
+
Vocab: hf://Jmica/F5TTS/JA_8500000/vocab_updated.txt
|
132 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
|
133 |
+
```
|
134 |
+
|
135 |
+
|
136 |
+
## Mandarin
|
137 |
+
|
138 |
+
|
139 |
+
## Spanish
|
140 |
+
|
141 |
+
#### F5-TTS Base @ es @ jpgallegoar
|
142 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
143 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
144 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
|
145 |
+
|
146 |
+
- @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
|
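For any of the checkpoints listed above, inference can also be driven from Python with the same helpers that `infer_cli.py` (further down in this commit) uses. A minimal sketch, assuming the files are fetched with `cached_path` and reusing the Italian entry's paths and `Config` values; the reference audio, reference text, and output filenames are hypothetical placeholders:

```python
import soundfile as sf
from cached_path import cached_path

from f5_tts.infer.utils_infer import infer_process, load_model, load_vocoder, preprocess_ref_audio_text
from f5_tts.model import DiT

# checkpoint + vocab from the model card entry above
ckpt_file = str(cached_path("hf://alien79/F5-TTS-italian/model_159600.safetensors"))
vocab_file = str(cached_path("hf://alien79/F5-TTS-italian/vocab.txt"))
model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)  # the "Config" line above

vocoder = load_vocoder(vocoder_name="vocos", is_local=False, local_path="../checkpoints/vocos-mel-24khz")
ema_model = load_model(DiT, model_cfg, ckpt_file, mel_spec_type="vocos", vocab_file=vocab_file)

# placeholder reference clip and transcript
ref_audio, ref_text = preprocess_ref_audio_text("ref_it.wav", "Testo di riferimento.")
audio, sample_rate, _ = infer_process(ref_audio, ref_text, "Ciao, questo è un test.", ema_model, vocoder, mel_spec_type="vocos")
sf.write("out_it.wav", audio, sample_rate)
```

The same flow is available end to end through `infer_cli.py` via `-p`, `-v` and `--vocoder_name`.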
src/f5_tts/infer/examples/basic/basic.toml
CHANGED
@@ -8,4 +8,4 @@ gen_text = "I don't really care what you call me. I've been a silent spectator,
 gen_file = ""
 remove_silence = false
 output_dir = "tests"
+output_file = "infer_cli_basic.wav"
src/f5_tts/infer/examples/multi/story.toml
CHANGED
@@ -8,6 +8,7 @@ gen_text = ""
 gen_file = "infer/examples/multi/story.txt"
 remove_silence = true
 output_dir = "tests"
+output_file = "infer_cli_story.wav"
 
 [voices.town]
 ref_audio = "infer/examples/multi/town.flac"
src/f5_tts/infer/infer_cli.py
CHANGED
@@ -2,6 +2,7 @@ import argparse
 import codecs
 import os
 import re
+from datetime import datetime
 from importlib.resources import files
 from pathlib import Path
 
@@ -9,8 +10,17 @@ import numpy as np
 import soundfile as sf
 import tomli
 from cached_path import cached_path
+from omegaconf import OmegaConf
 
 from f5_tts.infer.utils_infer import (
+    mel_spec_type,
+    target_rms,
+    cross_fade_duration,
+    nfe_step,
+    cfg_strength,
+    sway_sampling_coef,
+    speed,
+    fix_duration,
     infer_process,
     load_model,
     load_vocoder,
@@ -19,6 +29,7 @@ from f5_tts.infer.utils_infer import (
 )
 from f5_tts.model import DiT, UNetT
 
+
 parser = argparse.ArgumentParser(
     prog="python3 infer-cli.py",
     description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
@@ -27,74 +38,168 @@ parser = argparse.ArgumentParser(
 parser.add_argument(
     "-c",
     "--config",
+    type=str,
     default=os.path.join(files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"),
+    help="The configuration file, default see infer/examples/basic/basic.toml",
 )
+
+
+# Note. Not to provide default value here in order to read default from config file
+
 parser.add_argument(
     "-m",
     "--model",
+    type=str,
+    help="The model name: F5-TTS | E2-TTS",
+)
+parser.add_argument(
+    "-mc",
+    "--model_cfg",
+    type=str,
+    help="The path to F5-TTS model config file .yaml",
 )
 parser.add_argument(
     "-p",
     "--ckpt_file",
+    type=str,
+    help="The path to model checkpoint .pt, leave blank to use default",
 )
 parser.add_argument(
     "-v",
     "--vocab_file",
+    type=str,
+    help="The path to vocab file .txt, leave blank to use default",
+)
-parser.add_argument("-r", "--ref_audio", type=str, help="Reference audio file < 15 seconds.")
-parser.add_argument("-s", "--ref_text", type=str, default="666", help="Subtitle for the reference audio.")
+parser.add_argument(
+    "-r",
+    "--ref_audio",
+    type=str,
+    help="The reference audio file.",
+)
+parser.add_argument(
+    "-s",
+    "--ref_text",
+    type=str,
+    help="The transcript/subtitle for the reference audio",
 )
 parser.add_argument(
     "-t",
     "--gen_text",
     type=str,
+    help="The text to make model synthesize a speech",
 )
 parser.add_argument(
     "-f",
     "--gen_file",
     type=str,
+    help="The file with text to generate, will ignore --gen_text",
 )
 parser.add_argument(
     "-o",
     "--output_dir",
     type=str,
+    help="The path to output folder",
 )
 parser.add_argument(
     "-w",
     "--output_file",
     type=str,
+    help="The name of output file",
+)
+parser.add_argument(
+    "--save_chunk",
+    action="store_true",
+    help="To save each audio chunks during inference",
 )
 parser.add_argument(
     "--remove_silence",
+    action="store_true",
+    help="To remove long silence found in ouput",
 )
-parser.add_argument("--vocoder_name", type=str, default="vocos", choices=["vocos", "bigvgan"], help="vocoder name")
 parser.add_argument(
     "--load_vocoder_from_local",
     action="store_true",
+    help="To load vocoder from local dir, default to ../checkpoints/vocos-mel-24khz",
+)
+parser.add_argument(
+    "--vocoder_name",
+    type=str,
+    choices=["vocos", "bigvgan"],
+    help=f"Used vocoder name: vocos | bigvgan, default {mel_spec_type}",
+)
+parser.add_argument(
+    "--target_rms",
+    type=float,
+    help=f"Target output speech loudness normalization value, default {target_rms}",
+)
+parser.add_argument(
+    "--cross_fade_duration",
+    type=float,
+    help=f"Duration of cross-fade between audio segments in seconds, default {cross_fade_duration}",
+)
+parser.add_argument(
+    "--nfe_step",
+    type=int,
+    help=f"The number of function evaluation (denoising steps), default {nfe_step}",
+)
+parser.add_argument(
+    "--cfg_strength",
+    type=float,
+    help=f"Classifier-free guidance strength, default {cfg_strength}",
+)
+parser.add_argument(
+    "--sway_sampling_coef",
+    type=float,
+    help=f"Sway Sampling coefficient, default {sway_sampling_coef}",
 )
 parser.add_argument(
     "--speed",
     type=float,
+    help=f"The speed of the generated audio, default {speed}",
+)
+parser.add_argument(
+    "--fix_duration",
+    type=float,
+    help=f"Fix the total duration (ref and gen audios) in seconds, default {fix_duration}",
 )
 args = parser.parse_args()
 
+
+# config file
+
 config = tomli.load(open(args.config, "rb"))
 
+
+# command-line interface parameters
+
+model = args.model or config.get("model", "F5-TTS")
+model_cfg = args.model_cfg or config.get("model_cfg", str(files("f5_tts").joinpath("configs/F5TTS_Base_train.yaml")))
+ckpt_file = args.ckpt_file or config.get("ckpt_file", "")
+vocab_file = args.vocab_file or config.get("vocab_file", "")
+
+ref_audio = args.ref_audio or config.get("ref_audio", "infer/examples/basic/basic_ref_en.wav")
+ref_text = args.ref_text or config.get("ref_text", "Some call me nature, others call me mother nature.")
+gen_text = args.gen_text or config.get("gen_text", "Here we generate something just for test.")
+gen_file = args.gen_file or config.get("gen_file", "")
+
+output_dir = args.output_dir or config.get("output_dir", "tests")
+output_file = args.output_file or config.get(
+    "output_file", f"infer_cli_{datetime.now().strftime(r'%Y%m%d_%H%M%S')}.wav"
+)
+
+save_chunk = args.save_chunk or config.get("save_chunk", False)
+remove_silence = args.remove_silence or config.get("remove_silence", False)
+load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)
+
+vocoder_name = args.vocoder_name or config.get("vocoder_name", mel_spec_type)
+target_rms = args.target_rms or config.get("target_rms", target_rms)
+cross_fade_duration = args.cross_fade_duration or config.get("cross_fade_duration", cross_fade_duration)
+nfe_step = args.nfe_step or config.get("nfe_step", nfe_step)
+cfg_strength = args.cfg_strength or config.get("cfg_strength", cfg_strength)
+sway_sampling_coef = args.sway_sampling_coef or config.get("sway_sampling_coef", sway_sampling_coef)
+speed = args.speed or config.get("speed", speed)
+fix_duration = args.fix_duration or config.get("fix_duration", fix_duration)
+
 
 # patches for pip pkg user
 if "infer/examples/" in ref_audio:
@@ -107,34 +212,39 @@ if "voices" in config:
         if "infer/examples/" in voice_ref_audio:
             config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(f"{voice_ref_audio}"))
 
+
+# ignore gen_text if gen_file provided
+
 if gen_file:
     gen_text = codecs.open(gen_file, "r", "utf-8").read()
-ckpt_file = args.ckpt_file if args.ckpt_file else ""
-vocab_file = args.vocab_file if args.vocab_file else ""
-remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
-speed = args.speed
+
+
+# output path
 
 wave_path = Path(output_dir) / output_file
 # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
+if save_chunk:
+    output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
+    if not os.path.exists(output_chunk_dir):
+        os.makedirs(output_chunk_dir)
+
+
+# load vocoder
 
-vocoder_name = args.vocoder_name
-mel_spec_type = args.vocoder_name
 if vocoder_name == "vocos":
     vocoder_local_path = "../checkpoints/vocos-mel-24khz"
 elif vocoder_name == "bigvgan":
     vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
 
+vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=load_vocoder_from_local, local_path=vocoder_local_path)
+
 
-# load models
+# load TTS model
 
 if model == "F5-TTS":
     model_cls = DiT
+    model_cfg = OmegaConf.load(model_cfg).model.arch
+    if not ckpt_file:  # path not specified, download from repo
        if vocoder_name == "vocos":
            repo_name = "F5-TTS"
            exp_name = "F5TTS_Base"
@@ -148,22 +258,25 @@ if model == "F5-TTS":
         ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
 
 elif model == "E2-TTS":
+    assert args.model_cfg is None, "E2-TTS does not support custom model_cfg yet"
+    assert vocoder_name == "vocos", "E2-TTS only supports vocoder vocos yet"
     model_cls = UNetT
     model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
+    if not ckpt_file:  # path not specified, download from repo
         repo_name = "E2-TTS"
         exp_name = "E2TTS_Base"
         ckpt_step = 1200000
         ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
         # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors; local path
 
 print(f"Using {model}...")
+ema_model = load_model(model_cls, model_cfg, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file)
+
 
+# inference process
 
+
+def main():
     main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
     if "voices" not in config:
         voices = {"main": main_voice}
@@ -171,16 +284,16 @@ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove
         voices = config["voices"]
         voices["main"] = main_voice
     for voice in voices:
+        print("Voice:", voice)
+        print("ref_audio ", voices[voice]["ref_audio"])
         voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
             voices[voice]["ref_audio"], voices[voice]["ref_text"]
         )
-        print("Ref_audio:", voices[voice]["ref_audio"])
-        print("Ref_text:", voices[voice]["ref_text"])
+        print("ref_audio_", voices[voice]["ref_audio"], "\n\n")
 
     generated_audio_segments = []
     reg1 = r"(?=\[\w+\])"
+    chunks = re.split(reg1, gen_text)
     reg2 = r"\[(\w+)\]"
     for text in chunks:
         if not text.strip():
@@ -195,14 +308,35 @@ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove
             print(f"Voice {voice} not found, using main.")
             voice = "main"
         text = re.sub(reg2, "", text)
+        ref_audio_ = voices[voice]["ref_audio"]
+        ref_text_ = voices[voice]["ref_text"]
+        gen_text_ = text.strip()
         print(f"Voice: {voice}")
+        audio_segment, final_sample_rate, spectragram = infer_process(
+            ref_audio_,
+            ref_text_,
+            gen_text_,
+            ema_model,
+            vocoder,
+            mel_spec_type=vocoder_name,
+            target_rms=target_rms,
+            cross_fade_duration=cross_fade_duration,
+            nfe_step=nfe_step,
+            cfg_strength=cfg_strength,
+            sway_sampling_coef=sway_sampling_coef,
+            speed=speed,
+            fix_duration=fix_duration,
        )
+        generated_audio_segments.append(audio_segment)
+
+        if save_chunk:
+            if len(gen_text_) > 200:
+                gen_text_ = gen_text_[:200] + " ... "
+            sf.write(
+                os.path.join(output_chunk_dir, f"{len(generated_audio_segments)-1}_{gen_text_}.wav"),
+                audio_segment,
+                final_sample_rate,
+            )
 
     if generated_audio_segments:
         final_wave = np.concatenate(generated_audio_segments)
@@ -218,9 +352,5 @@ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove
         print(f.name)
 
 
-def main():
-    main_process(ref_audio, ref_text, gen_text, ema_model, mel_spec_type, remove_silence, speed)
-
-
 if __name__ == "__main__":
     main()
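The rewritten CLI resolves every setting with the same precedence: an explicit command-line flag, then the key in the `-c` .toml file, then the default imported from `utils_infer`. A minimal sketch of that resolution order using `nfe_step` as the example; the numeric values here are illustrative placeholders, not the library defaults:

```python
import tomli

cli_value = None                         # what argparse yields when --nfe_step is not passed
config = tomli.loads("nfe_step = 16\n")  # stands in for the file parsed from -c/--config
fallback = 32                            # stands in for the default imported from utils_infer

nfe_step = cli_value or config.get("nfe_step", fallback)
print(nfe_step)  # 16: the config file wins because no CLI flag was given
```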
src/f5_tts/infer/utils_infer.py
CHANGED
@@ -138,7 +138,11 @@ asr_pipe = None
 def initialize_asr_pipeline(device: str = device, dtype=None):
     if dtype is None:
         dtype = (
-            torch.float16
+            torch.float16
+            if "cuda" in device
+            and torch.cuda.get_device_properties(device).major >= 6
+            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
+            else torch.float32
         )
     global asr_pipe
     asr_pipe = pipeline(

@@ -171,7 +175,11 @@ def transcribe(ref_audio, language=None):
 def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
     if dtype is None:
         dtype = (
-            torch.float16
+            torch.float16
+            if "cuda" in device
+            and torch.cuda.get_device_properties(device).major >= 6
+            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
+            else torch.float32
         )
     model = model.to(dtype)

@@ -338,7 +346,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
     else:
         ref_text += ". "
 
+    print("\nref_text ", ref_text)
 
     return ref_audio, ref_text

@@ -370,6 +378,7 @@ def infer_process(
     gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
     for i, gen_text in enumerate(gen_text_batches):
         print(f"gen_text {i}", gen_text)
+    print("\n")
 
     show_info(f"Generating audio in {len(gen_text_batches)} batches...")
     return infer_batch_process(
src/f5_tts/model/backbones/dit.py
CHANGED
@@ -105,6 +105,7 @@ class DiT(nn.Module):
         text_dim=None,
         conv_layers=0,
         long_skip_connection=False,
+        checkpoint_activations=False,
     ):
         super().__init__()
 
@@ -127,6 +128,16 @@ class DiT(nn.Module):
         self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
         self.proj_out = nn.Linear(dim, mel_dim)
 
+        self.checkpoint_activations = checkpoint_activations
+
+    def ckpt_wrapper(self, module):
+        # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
+        def ckpt_forward(*inputs):
+            outputs = module(*inputs)
+            return outputs
+
+        return ckpt_forward
+
     def forward(
         self,
         x: float["b n d"],  # nosied input audio  # noqa: F722
 
@@ -152,7 +163,10 @@ class DiT(nn.Module):
         residual = x
 
         for block in self.transformer_blocks:
+            if self.checkpoint_activations:
+                x = torch.utils.checkpoint.checkpoint(self.ckpt_wrapper(block), x, t, mask, rope)
+            else:
+                x = block(x, t, mask=mask, rope=rope)
 
         if self.long_skip_connection is not None:
             x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
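A minimal sketch (not part of the diff) of how the new flag would be switched on when constructing the backbone directly. The architecture numbers mirror the F5TTS_Base config used elsewhere in this commit; the vocab size is a hypothetical placeholder:

```python
from f5_tts.model import DiT

transformer = DiT(
    dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4,
    text_num_embeds=2546,         # placeholder: use the size of your vocab.txt
    mel_dim=100,                  # n_mel_channels, matching the mel settings used in this repo
    checkpoint_activations=True,  # recompute each transformer block in backward, trading compute for memory
)
```

During training this would normally be driven through the model config rather than built by hand; the sketch only shows where the new argument ends up.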
src/f5_tts/model/trainer.py
CHANGED
@@ -315,7 +315,7 @@ class Trainer:
                 self.scheduler.step()
                 self.optimizer.zero_grad()
 
-                if self.is_main:
+                if self.is_main and self.accelerator.sync_gradients:
                     self.ema_model.update()
 
                 global_step += 1
src/f5_tts/model/utils.py
CHANGED
@@ -133,16 +133,23 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
 
 # convert char to pinyin
 
+jieba.initialize()
+print("Word segmentation module jieba initialized.\n")
+
 
 def convert_char_to_pinyin(text_list, polyphone=True):
     final_text_list = []
+    custom_trans = str.maketrans(
+        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
+    )  # add custom trans here, to address oov
+
+    def is_chinese(c):
+        return (
+            "\u3100" <= c <= "\u9fff"  # common chinese characters
+        )
+
     for text in text_list:
         char_list = []
-        text = text.translate(god_knows_why_en_testset_contains_zh_quote)
         text = text.translate(custom_trans)
         for seg in jieba.cut(text):
             seg_byte_len = len(bytes(seg, "UTF-8"))

@@ -150,22 +157,21 @@ def convert_char_to_pinyin(text_list, polyphone=True):
             if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                 char_list.append(" ")
             char_list.extend(seg)
+            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
+                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
+                for i, c in enumerate(seg):
+                    if is_chinese(c):
                         char_list.append(" ")
+                    char_list.append(seg_[i])
+            else:  # if mixed characters, alphabets and symbols
                 for c in seg:
                     if ord(c) < 256:
                         char_list.extend(c)
+                    elif is_chinese(c):
+                        char_list.append(" ")
+                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                     else:
+                        char_list.append(c)
         final_text_list.append(char_list)
 
     return final_text_list
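A quick sketch (not part of the diff) of what the updated text front-end does with mixed input: jieba segments the text, Chinese characters are mapped to TONE3 pinyin with tone sandhi, and ASCII passes through unchanged; the sample sentence is arbitrary:

```python
from f5_tts.model.utils import convert_char_to_pinyin

# One batch entry mixing Chinese and ASCII; the semicolon is also remapped by custom_trans.
print(convert_char_to_pinyin(["这是 F5-TTS 的测试; done."], polyphone=True))
```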
src/f5_tts/train/README.md
CHANGED
@@ -2,9 +2,9 @@

## Prepare Dataset

Example data processing scripts are provided below, and you may tailor your own along with a Dataset class in `src/f5_tts/model/dataset.py`.

### 1. Some specific dataset preparation scripts
Download the corresponding dataset first, and fill in the path in the scripts.

@@ -16,6 +16,9 @@ python src/f5_tts/train/datasets/prepare_wenetspeech4tts.py

```bash
# Prepare the LibriTTS dataset
python src/f5_tts/train/datasets/prepare_libritts.py

# Prepare the LJSpeech dataset
python src/f5_tts/train/datasets/prepare_ljspeech.py
```

### 2. Create custom dataset with metadata.csv

@@ -35,7 +38,12 @@ Once your datasets are prepared, you can start the training process.

```bash
# setup accelerate config, e.g. use multi-gpu ddp, fp16
# will be to: ~/.cache/huggingface/accelerate/default_config.yaml
accelerate config

# .yaml files are under src/f5_tts/configs directory
accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml

# possible to overwrite accelerate and hydra config
accelerate launch --mixed_precision=fp16 src/f5_tts/train/train.py --config-name F5TTS_Small_train.yaml ++datasets.batch_size_per_gpu=19200
```

### 2. Finetuning practice

@@ -43,6 +51,8 @@ Discussion board for Finetuning [#57](https://github.com/SWivid/F5-TTS/discussio

Gradio UI training/finetuning with `src/f5_tts/train/finetune_gradio.py`, see [#143](https://github.com/SWivid/F5-TTS/discussions/143).

Setting `use_ema = True` can be harmful for early-stage finetuned checkpoints (which have seen only a few updates, so the EMA weights are still dominated by the pretrained ones); try turning it off and see if it gives better results.

### 3. Wandb Logging

The `wandb/` dir will be created under the path where you run the training/finetuning scripts.
src/f5_tts/train/datasets/prepare_ljspeech.py
ADDED
@@ -0,0 +1,65 @@
import os
import sys

sys.path.append(os.getcwd())

import json
from importlib.resources import files
from pathlib import Path
from tqdm import tqdm
import soundfile as sf
from datasets.arrow_writer import ArrowWriter


def main():
    result = []
    duration_list = []
    text_vocab_set = set()

    with open(meta_info, "r") as f:
        lines = f.readlines()
        for line in tqdm(lines):
            uttr, text, norm_text = line.split("|")
            norm_text = norm_text.strip()
            wav_path = Path(dataset_dir) / "wavs" / f"{uttr}.wav"
            duration = sf.info(wav_path).duration
            if duration < 0.4 or duration > 30:
                continue
            result.append({"audio_path": str(wav_path), "text": norm_text, "duration": duration})
            duration_list.append(duration)
            text_vocab_set.update(list(norm_text))

    # save preprocessed dataset to disk
    if not os.path.exists(f"{save_dir}"):
        os.makedirs(f"{save_dir}")
    print(f"\nSaving to {save_dir} ...")

    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
        for line in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(line)

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer
    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    with open(f"{save_dir}/vocab.txt", "w") as f:
        for vocab in sorted(text_vocab_set):
            f.write(vocab + "\n")

    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")


if __name__ == "__main__":
    tokenizer = "char"  # "pinyin" | "char"

    dataset_dir = "<SOME_PATH>/LJSpeech-1.1"
    dataset_name = f"LJSpeech_{tokenizer}"
    meta_info = os.path.join(dataset_dir, "metadata.csv")
    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")

    main()
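For reference, the parser above expects the stock LJSpeech `metadata.csv` layout: one pipe-separated row of utterance id, raw transcript, and normalized transcript. A tiny illustrative sketch (the row text is shortened, not taken from this commit):

```python
# One metadata.csv row as consumed by main() above: "<utt id>|<raw text>|<normalized text>"
line = "LJ001-0001|Printing, in the only sense ...|Printing, in the only sense ...\n"
uttr, text, norm_text = line.split("|")
print(uttr, norm_text.strip())
```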
src/f5_tts/train/train.py
CHANGED
@@ -1,100 +1,72 @@
 # training script.
 
+import os
 from importlib.resources import files
 
+import hydra
+
 from f5_tts.model import CFM, DiT, Trainer, UNetT
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
-target_sample_rate = 24000
-n_mel_channels = 100
-hop_length = 256
-win_length = 1024
-n_fft = 1024
-mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'
-
-tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
-tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
-dataset_name = "Emilia_ZH_EN"
-
-# -------------------------- Training Settings -------------------------- #
-
-exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
-
-grad_accumulation_steps = 1  # note: updates = steps / grad_accumulation_steps
-max_grad_norm = 1.0
-
-epochs = 11  # use linear decay, thus epochs control the slope
-num_warmup_updates = 20000  # warmup steps
-save_per_updates = 50000  # save checkpoint per steps
-last_per_steps = 5000  # save last checkpoint per steps
-
-# model params
-if exp_name == "F5TTS_Base":
-    wandb_resume_id = None
-    model_cls = DiT
-    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
-elif exp_name == "E2TTS_Base":
-    wandb_resume_id = None
-    model_cls = UNetT
-    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
-
-# ----------------------------------------------------------------------- #
-
-def main():
-    if tokenizer == "custom":
-        tokenizer_path = tokenizer_path
+os.chdir(str(files("f5_tts").joinpath("../..")))  # change working directory to root of project (local editable)
+
+
+@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
+def main(cfg):
+    tokenizer = cfg.model.tokenizer
+    mel_spec_type = cfg.model.mel_spec.mel_spec_type
+    exp_name = f"{cfg.model.name}_{mel_spec_type}_{cfg.model.tokenizer}_{cfg.datasets.name}"
+
+    # set text tokenizer
+    if tokenizer != "custom":
+        tokenizer_path = cfg.datasets.name
     else:
+        tokenizer_path = cfg.model.tokenizer_path
     vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)
 
+    # set model
+    if "F5TTS" in cfg.model.name:
+        model_cls = DiT
+    elif "E2TTS" in cfg.model.name:
+        model_cls = UNetT
+    wandb_resume_id = None
 
     model = CFM(
+        transformer=model_cls(**cfg.model.arch, text_num_embeds=vocab_size, mel_dim=cfg.model.mel_spec.n_mel_channels),
+        mel_spec_kwargs=cfg.model.mel_spec,
         vocab_char_map=vocab_char_map,
     )
 
+    # init trainer
     trainer = Trainer(
         model,
-        epochs,
-        learning_rate,
-        num_warmup_updates=num_warmup_updates,
-        save_per_updates=save_per_updates,
-        batch_size=batch_size_per_gpu,
-        batch_size_type=batch_size_type,
-        max_samples=max_samples,
-        grad_accumulation_steps=grad_accumulation_steps,
-        max_grad_norm=max_grad_norm,
+        epochs=cfg.optim.epochs,
+        learning_rate=cfg.optim.learning_rate,
+        num_warmup_updates=cfg.optim.num_warmup_updates,
+        save_per_updates=cfg.ckpts.save_per_updates,
+        checkpoint_path=str(files("f5_tts").joinpath(f"../../{cfg.ckpts.save_dir}")),
+        batch_size=cfg.datasets.batch_size_per_gpu,
+        batch_size_type=cfg.datasets.batch_size_type,
+        max_samples=cfg.datasets.max_samples,
+        grad_accumulation_steps=cfg.optim.grad_accumulation_steps,
+        max_grad_norm=cfg.optim.max_grad_norm,
+        logger=cfg.ckpts.logger,
         wandb_project="CFM-TTS",
         wandb_run_name=exp_name,
         wandb_resume_id=wandb_resume_id,
-        last_per_steps=last_per_steps,
+        last_per_steps=cfg.ckpts.last_per_steps,
         log_samples=True,
+        bnb_optimizer=cfg.optim.bnb_optimizer,
         mel_spec_type=mel_spec_type,
+        is_local_vocoder=cfg.model.vocoder.is_local,
+        local_vocoder_path=cfg.model.vocoder.local_path,
     )
 
+    train_dataset = load_dataset(cfg.datasets.name, tokenizer, mel_spec_kwargs=cfg.model.mel_spec)
    trainer.train(
         train_dataset,
+        num_workers=cfg.datasets.num_workers,
         resumable_with_seed=666,  # seed for shuffling dataset
     )
 