KaniTTS

Runtime error

App Files Files Community

jblast94 committed 23 days ago

Commit

ed9e433

verified ·

1 Parent(s): 24c936f

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -97

app.py CHANGED Viewed

@@ -1,130 +1,179 @@
 import gradio as gr
 import torch
 import os
-# You must use the exact same model name as your repo
-MODEL_ID = "nineninesix/Kani-TTS-370m"
-# --- Global variable to store loaded models ---
-MODELS = {}
 @spaces.GPU
-def load_models():
-    """Load models into GPU memory and store in a global variable."""
-    global MODELS
-    if not MODELS:
-        print("Loading models into GPU memory...")
-        from transformers import AutoModel, AutoConfig
-        model_path = MODEL_ID
-        # Load both the main model and its configuration
-        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        # Store the loaded model and its configuration in the global variable
-        MODELS = {
-            "Kani TTS 370M": (model, config)
-        }
-        print(f"Models loaded. Available speakers: {list(config.speaker_id.keys()) if config.speaker_id else []}")
-        return MODELS
-# --- Define a separate function for updating the stats display ---
-def update_stats_display():
-    """This function gets the agent's stats and returns a formatted string for Gradio."""
-    # This assumes 'agent' is a global instance of your ConversationalAgent class
-    stats_text = agent.get_memory_stats()
-    return gr.Markdown(f"### 📊 Memory Stats\n{stats_text}")
-def generate_speech(text: str, model_choice: str, speaker_display: str):
-    """Generate speech using the selected model."""
     if not text.strip():
-        return "Please enter text for speech generation.", None
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {device}")
-        # Ensure models are loaded
-        if not MODELS:
-            load_models()
-        # Get the selected model from the global variable
-        if model_choice not in MODELS:
-            return f"Model '{model_choice}' not found.", None
-        selected_model = MODELS[model_choice]
-        # --- This is the key part to load a specific model ---
-        model_to_generate = selected_model[0]
-        cfg = selected_model[1]  # Model config
         speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
         if speaker_display and speaker_map:
             speaker_id = speaker_map.get(speaker_display)
         else:
             speaker_id = None
-        print(f"Generating speech with {model_choice}...")
-        # --- Use the specific part of the model for generation ---
-        audio, _, time_report = model_to_generate.run_model(
-            text=text,
-            speaker_id=speaker_id,
-            temperature=0.7,
-            repetition_penalty=1.2,
-            max_tokens=1024
-        )
-        sample_rate = 22050
         print("Speech generation completed!")
-        return (sample_rate, audio), time_report
-# --- Create and configure the Gradio interface ---
-MODELS = load_models()
-with gr.Blocks(title="😻 KaniTTS - Text to Speech") as demo:
     gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
-    model_dropdown = gr.Dropdown(
-        choices=list(MODELS.keys()),
-        value=list(MODELS.keys())[0],
-        label="Selected Model"
-    )
-    # --- Speaker selector (populated on model load) ---
-    all_speakers = []
-    if MODELS and list(MODELS.keys())[0] and MODELS[list(MODELS.keys())[0]][1]:
-        all_speakers.extend(list(MODELS[list(MODELS.keys())[0]][1].speaker_id.keys()))
-    all_speakers = sorted(list(set(all_speakers)))
-    speaker_dropdown = gr.Dropdown(
-        choices=all_speakers,
-        value=None,
-        label="Speaker",
-        visible=True,
-        allow_custom_value=True
-    )
-    text_input = gr.Textbox(label="Text", lines=5)
-    generate_btn = gr.Button("Generate Speech", variant="primary")
-    audio_output = gr.Audio(label="Generated Audio", type="numpy")
-    # --- Define the event to update the speakers when the model changes ---
     model_dropdown.change(
-        fn=lambda choice: gr.update(choices=list(MODELS[choice][1].speaker_id.keys()), value=None, visible=True) if MODELS and MODELS[choice][1].speaker_id else gr.update(visible=False),
         inputs=[model_dropdown],
         outputs=[speaker_dropdown]
     )
-    # --- Wire up the main generation button ---
     generate_btn.click(
-        fn=generate_speech,
-        inputs=[text_input, model_dropdown, speaker_dropdown],
-        outputs=[audio_output]
     )
-    # --- This is the API-enabling line ---
-    demo.queue().launch(show_api=True)

+from create_env import setup_dependencies
+setup_dependencies()
+import spaces
 import gradio as gr
+from util import NemoAudioPlayer, InitModels, load_config, Examples
+import numpy as np
 import torch
 import os
+# Get HuggingFace token
+token_ = os.getenv('HF_TOKEN')
+config = load_config("./model_config.yaml")
+models_configs = config.models
+nemo_player_cfg = config.nemo_player
+examples_cfg = load_config("./examples.yaml")
+examples_maker = Examples(examples_cfg)
+examples = examples_maker()
+player = NemoAudioPlayer(nemo_player_cfg)
+init_models = InitModels(models_configs, player, token_)
+models = init_models()
 @spaces.GPU
+def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
+    """
+    Generate speech from text using the selected model on GPU
+    """
     if not text.strip():
+        return None, "Please enter text for speech generation."
+    if not model_choice:
+        return None, "Please select a model."
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {device}")
+        selected_model = models[model_choice]
+        cfg = models_configs.get(model_choice)
         speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
         if speaker_display and speaker_map:
             speaker_id = speaker_map.get(speaker_display)
         else:
             speaker_id = None
+        print(f"Generating speech with {model_choice}...")
+        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
+        sample_rate = 22050
         print("Speech generation completed!")
+        return (sample_rate, audio), time_report   #, f"✅ Audio generated successfully using {model_choice} on {device}"
+    except Exception as e:
+        print(f"Error during generation: {str(e)}")
+        return None, f"❌ Error during generation: {str(e)}"
+# Create Gradio interface
+with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Ocean()) as demo:
     gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
+    gr.Markdown("Select a model and enter text to generate emotional speech")
+    with gr.Row():
+        with gr.Column(scale=1):
+            model_dropdown = gr.Dropdown(
+                choices=list(models_configs.keys()),
+                value=list(models_configs.keys())[0],
+                label="Selected Model",
+                info="Base generates random voices"
+            )
+            # Speaker selector (shown only if model has speakers)
+            # Pre-populate all available speakers for example table rendering
+            all_speakers = []
+            for _cfg in models_configs.values():
+                if _cfg and _cfg.get('speaker_id'):
+                    all_speakers.extend(list(_cfg.speaker_id.keys()))
+            all_speakers = sorted(list(set(all_speakers)))
+            speaker_dropdown = gr.Dropdown(
+                choices=all_speakers,
+                value=None,
+                label="Speaker",
+                visible=False,
+                allow_custom_value=True
+            )
+            text_input = gr.Textbox(
+                label="Text",
+                placeholder="Enter your text ...",
+                lines=3,
+                max_lines=10
+            )
+            with gr.Accordion("Settings", open=False):
+                temp = gr.Slider(
+                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                    label="Temp",
+                )
+                top_p = gr.Slider(
+                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                    label="Top P",
+                )
+                rp = gr.Slider(
+                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                    label="Repetition Penalty",
+                )
+                max_tok = gr.Slider(
+                    minimum=100, maximum=2000, value=1000, step=100,
+                    label="Max Tokens",
+                )
+            generate_btn = gr.Button("Run", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(
+                label="Generated Audio",
+                type="numpy"
+            )
+            time_report_output = gr.Textbox(
+                label="Time Report",
+                interactive=False,
+                value="Ready to generate speech",
+                lines=3
+            )
+    # Update speakers when model changes
+    def update_speakers(model_choice):
+        cfg = models_configs.get(model_choice)
+        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
+        if speakers:
+            return gr.update(choices=speakers, value=speakers[0], visible=True)
+        else:
+            return gr.update(choices=[], value=None, visible=False)
     model_dropdown.change(
+        fn=update_speakers,
         inputs=[model_dropdown],
         outputs=[speaker_dropdown]
     )
+    # Populate speakers on initial page load based on default model
+    demo.load(
+        fn=update_speakers,
+        inputs=[model_dropdown],
+        outputs=[speaker_dropdown]
+    )
+    # GPU generation event
     generate_btn.click(
+        fn=generate_speech_gpu,
+        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
+        outputs=[audio_output, time_report_output]
     )
+    with gr.Row():
+        examples = examples
+        gr.Examples(
+            examples=examples,
+            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
+            fn=generate_speech_gpu,
+            outputs=[audio_output, time_report_output],
+            cache_examples=True,
+        )
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )