VoxCPM-0.5B

Running

App Files Files Community

akhaliq HF Staff committed on Sep 17

Commit

67d30f5

verified ·

1 Parent(s): d9396d8

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +204 -0

app.py ADDED Viewed

	@@ -0,0 +1,204 @@

+import gradio as gr
+import soundfile as sf
+import numpy as np
+from voxcpm import VoxCPM
+import tempfile
+import os
+import spaces
+# Instantiate the checkpoint a single time at import so every request reuses it.
+MODEL_ID = "openbmb/VoxCPM-0.5B"
+model = VoxCPM.from_pretrained(MODEL_ID)
+@spaces.GPU(duration=120)
+def generate_speech(
+    text,
+    prompt_audio,
+    prompt_text,
+    cfg_value,
+    inference_timesteps,
+    normalize,
+    denoise,
+    retry_badcase,
+    retry_badcase_max_times,
+    retry_badcase_ratio_threshold
+):
+    """Synthesize ``text`` with the module-level ``model`` and return a WAV path.
+
+    Args:
+        text: Text to synthesize. Empty or whitespace-only input is rejected
+            with a UI warning.
+        prompt_audio: Path of a reference audio file, or ``None``.
+        prompt_text: Transcript of the reference audio. Empty or
+            whitespace-only values are treated as ``None``.
+        cfg_value: Forwarded unchanged to ``model.generate``.
+        inference_timesteps: Coerced to ``int`` before being forwarded.
+        normalize: Forwarded unchanged to ``model.generate``.
+        denoise: Forwarded unchanged to ``model.generate``.
+        retry_badcase: Forwarded unchanged to ``model.generate``.
+        retry_badcase_max_times: Coerced to ``int`` before being forwarded.
+        retry_badcase_ratio_threshold: Forwarded unchanged to
+            ``model.generate``.
+
+    Returns:
+        Path of a temporary ``.wav`` file holding the generated audio, or
+        ``None`` when no text was supplied.
+
+    Raises:
+        gr.Error: If generation or writing the output file fails.
+    """
+    if not text or not text.strip():
+        gr.Warning("Please enter text to generate speech")
+        return None
+    # Forwarded as-is: either a file path or None when nothing was uploaded.
+    prompt_wav_path = prompt_audio
+    # A blank textbox yields "" rather than None, so empty and whitespace-only
+    # values are both normalized to None here.
+    if not prompt_text or not prompt_text.strip():
+        prompt_text = None
+    try:
+        wav = model.generate(
+            text=text,
+            prompt_wav_path=prompt_wav_path,
+            prompt_text=prompt_text,
+            cfg_value=cfg_value,
+            inference_timesteps=int(inference_timesteps),
+            normalize=normalize,
+            denoise=denoise,
+            retry_badcase=retry_badcase,
+            retry_badcase_max_times=int(retry_badcase_max_times),
+            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold
+        )
+        # Reserve a unique path, then close the handle before soundfile opens
+        # the same path for writing. delete=False keeps the file available to
+        # the caller after this function returns.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+            output_path = tmp_file.name
+        # NOTE(review): 16000 Hz is assumed to match the model's output sample
+        # rate - confirm against the VoxCPM documentation.
+        sf.write(output_path, wav, 16000)
+    except Exception as e:
+        # gr.Error is an exception class; it only reaches the UI when raised.
+        raise gr.Error(f"Error generating speech: {e}") from e
+    return output_path
+# Build the Gradio interface: inputs on the left, generated audio and tips on
+# the right, followed by cached examples and the button wiring.
+with gr.Blocks(title="VoxCPM Text-to-Speech") as demo:
+    gr.Markdown(
+        """
+        # 🎙️ VoxCPM Text-to-Speech
+        Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.
+        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Input section
+            text_input = gr.Textbox(
+                label="Text to Synthesize",
+                placeholder="Enter the text you want to convert to speech...",
+                lines=3,
+                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech."
+            )
+            with gr.Accordion("Voice Cloning (Optional)", open=False):
+                # The hint is rendered as Markdown because gr.Audio is not
+                # given an `info` argument here (it does not appear in the
+                # component's documented constructor parameters).
+                gr.Markdown("Upload a reference audio file for voice cloning")
+                prompt_audio = gr.Audio(
+                    label="Reference Audio",
+                    type="filepath",
+                    sources=["upload"]
+                )
+                prompt_text = gr.Textbox(
+                    label="Reference Text",
+                    placeholder="Text corresponding to the reference audio (optional)",
+                    lines=2
+                )
+            with gr.Accordion("Advanced Settings", open=False):
+                cfg_value = gr.Slider(
+                    minimum=0.5,
+                    maximum=5.0,
+                    value=2.0,
+                    step=0.1,
+                    label="CFG Value",
+                    info="LM guidance on LocDiT, higher for better adherence to prompt"
+                )
+                inference_timesteps = gr.Slider(
+                    minimum=5,
+                    maximum=50,
+                    value=10,
+                    step=1,
+                    label="Inference Timesteps",
+                    info="Higher for better quality, lower for faster speed"
+                )
+                with gr.Row():
+                    normalize = gr.Checkbox(
+                        value=True,
+                        label="Normalize",
+                        info="Enable external TN tool"
+                    )
+                    denoise = gr.Checkbox(
+                        value=True,
+                        label="Denoise",
+                        info="Enable external Denoise tool"
+                    )
+                    retry_badcase = gr.Checkbox(
+                        value=True,
+                        label="Retry Bad Cases",
+                        info="Enable retrying for bad cases"
+                    )
+                with gr.Row():
+                    retry_badcase_max_times = gr.Number(
+                        value=3,
+                        minimum=1,
+                        maximum=10,
+                        step=1,
+                        label="Max Retry Times"
+                    )
+                    retry_badcase_ratio_threshold = gr.Number(
+                        value=6.0,
+                        minimum=1.0,
+                        maximum=10.0,
+                        step=0.5,
+                        label="Retry Ratio Threshold"
+                    )
+            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            # Output section
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                type="filepath",
+                autoplay=False
+            )
+            gr.Markdown(
+                """
+                ### Tips:
+                - For voice cloning, upload a clear reference audio (3-10 seconds recommended)
+                - Higher CFG values provide better prompt adherence but may affect naturalness
+                - Increase inference timesteps for better quality at the cost of speed
+                - The retry mechanism helps handle edge cases automatically
+                """
+            )
+    # Examples: each row supplies only the five listed inputs, so the wrapper
+    # below fills in the remaining generate_speech arguments with the same
+    # values the advanced-settings widgets start with.
+    gr.Examples(
+        examples=[
+            ["Hello! Welcome to the VoxCPM text-to-speech demonstration. This model can generate highly expressive and natural-sounding speech.", None, None, 2.0, 10],
+            ["The quick brown fox jumps over the lazy dog. This pangram contains all letters of the alphabet.", None, None, 2.5, 15],
+            ["Artificial intelligence is transforming the way we interact with technology, making it more natural and intuitive.", None, None, 2.0, 10],
+        ],
+        inputs=[text_input, prompt_audio, prompt_text, cfg_value, inference_timesteps],
+        outputs=audio_output,
+        fn=lambda t, pa, pt, cfg, its: generate_speech(
+            t, pa, pt, cfg, its, True, True, True, 3, 6.0
+        ),
+        cache_examples=True,
+        cache_mode="lazy"
+    )
+    # Connect the generate button; the input order must match the positional
+    # parameter order of generate_speech.
+    generate_btn.click(
+        fn=generate_speech,
+        inputs=[
+            text_input,
+            prompt_audio,
+            prompt_text,
+            cfg_value,
+            inference_timesteps,
+            normalize,
+            denoise,
+            retry_badcase,
+            retry_badcase_max_times,
+            retry_badcase_ratio_threshold
+        ],
+        outputs=audio_output,
+        show_progress="full"
+    )
+# Start the server only when this file is executed as a script, so importing
+# the module (e.g. from tooling) does not launch it as a side effect.
+if __name__ == "__main__":
+    demo.launch()