Commit 8b08d3c · Parent(s): ac7c607

added distill model

Files changed:
- app.py (+188 -65)
- generation_counter.json (+1 -1)
- vertex_client.py (+125 -7)
app.py CHANGED

@@ -5,6 +5,7 @@ from pathlib import Path
 import uuid
 import fcntl
 import time
+import tempfile
 from vertex_client import get_vertex_client

 # gr.NO_RELOAD = False

@@ -152,8 +153,9 @@ def synthesize_speech(text, voice_id):

     if success and audio_bytes:
         print("✅ Synthesized audio using Vertex AI")
-        # Save binary audio to temp file
-
+        # Save binary audio to temp file in system temp directory
+        temp_dir = tempfile.gettempdir()
+        audio_file = os.path.join(temp_dir, f"ringg_{str(uuid.uuid4())}.wav")
         with open(audio_file, "wb") as f:
             f.write(audio_bytes)

@@ -170,7 +172,7 @@ def synthesize_speech(text, voice_id):
             rtf_no_vocoder
         ) = ""

-        status_msg = "
+        status_msg = ""

     return (
         audio_file,

@@ -220,7 +222,7 @@ with gr.Blocks(

     # Best Practices Section
     gr.Markdown("""
-
+    ## 📝 Best Practices for Best Results
     - **Supported Languages:** Hindi and English only
     - **Check spelling carefully:** Misspelled words may be mispronounced
     - **Punctuation matters:** Use proper punctuation for natural pauses and intonation

@@ -228,41 +230,62 @@ with gr.Blocks(
     - **Numbers & dates:** Write numbers as words for better pronunciation (e.g., "twenty-five" instead of "25")
     """)

-    # Text
-
-
-
-
-
-
-
-
-
+    # Input Section - Text, Voice, and Character Count grouped together
+    with gr.Group():
+        # Text Input
+        text_input = gr.Textbox(
+            label="Text (max 500 characters)",
+            placeholder="Type or paste your text here (max 500 characters)...",
+            lines=6,
+            max_lines=10,
+            max_length=500,
+        )
+        # Voice Selection
+        voices = get_voices()
+        voice_choices = {display: vid for display, vid in voices}
+
+        voice_dropdown = gr.Dropdown(
+            choices=list(voice_choices.keys()),
+            label="Choose a voice style",
+            info=f"{len(voices)} voices available",
+            value=list(voice_choices.keys())[0] if voices else None,
+            show_label=False,
+        )
+        # Character count display
+        char_count = gr.Code(
+            "Character count: 0 / 500",
+            show_line_numbers=False,
+            show_label=False,
+        )

+    # Side-by-side comparison of Base and Distill models
+    gr.Markdown("### 🎧 Audio Results Comparison")
     with gr.Row():
         with gr.Column(scale=1):
-            #
-
-
-
-
-
-            label="Choose a voice style",
-            info=f"{len(voices)} voices available",
-            value=list(voice_choices.keys())[0] if voices else None,
+            # gr.Markdown("#### Base Model")
+            audio_output_base = gr.Audio(label="Base Model Audio", type="filepath")
+            status_base = gr.Markdown("", visible=True)
+            metrics_header_base = gr.Markdown("**📊 Metrics**", visible=False)
+            metrics_output_base = gr.Code(
+                label="Base Metrics", language="json", interactive=False, visible=False
             )

         with gr.Column(scale=1):
-
-
-
-
+            # gr.Markdown("#### Distill Model")
+            audio_output_distill = gr.Audio(
+                label="Distill Model Audio", type="filepath"
+            )
+            status_distill = gr.Markdown("", visible=True)
+            metrics_header_distill = gr.Markdown("**📊 Metrics**", visible=False)
+            metrics_output_distill = gr.Code(
+                label="Distill Metrics",
+                language="json",
+                interactive=False,
+                visible=False,
             )

     generate_btn = gr.Button("🎬 Generate Speech", variant="primary", size="lg")

-    gr.Markdown("#### 🎯 Try these examples:")
     with gr.Row():
         example_btn1 = gr.Button("English Example", size="sm")
         example_btn2 = gr.Button("Hindi Example", size="sm")

@@ -280,52 +303,148 @@ with gr.Blocks(
     def update_char_count(text):
         """Update character count as user types"""
         count = len(text) if text else 0
-        return f"
+        return f"Character count: {count} / 500"

     def load_example_text(example_text):
         """Load example text and update character count"""
         count = len(example_text)
-        return example_text, f"
+        return example_text, f"Character count: {count} / 500"

     def clear_text():
         """Clear text input"""
-        return "", "
+        return "", "Character count: 0 / 500"

     def on_generate(text, voice_display):
+        """Generate speech using both base and distill models in parallel."""
+        # Validate inputs
+        if not text or not text.strip():
+            error_msg = "⚠️ Please enter some text"
+            yield (
+                None,
+                error_msg,
+                gr.update(visible=False),
+                gr.update(visible=False),
+                None,
+                error_msg,
+                gr.update(visible=False),
+                gr.update(visible=False),
+                f"**🌍 Generations:** {load_counter()}",
+            )
+            return
+
         voice_id = voice_choices.get(voice_display)
-
-
+        if not voice_id:
+            error_msg = "⚠️ Please select a voice"
+            yield (
+                None,
+                error_msg,
+                gr.update(visible=False),
+                gr.update(visible=False),
+                None,
+                error_msg,
+                gr.update(visible=False),
+                gr.update(visible=False),
+                f"**🌍 Generations:** {load_counter()}",
+            )
+            return
+
+        # Initialize state for both models
+        results = {
+            "base": {"audio": None, "status": "⏳ Loading...", "metrics": None},
+            "distill": {"audio": None, "status": "⏳ Loading...", "metrics": None},
+        }
+
+        # Show loading state initially
+        yield (
+            None,
+            results["base"]["status"],
+            gr.update(visible=False),
+            gr.update(visible=False),
+            None,
+            results["distill"]["status"],
+            gr.update(visible=False),
+            gr.update(visible=False),
+            f"**🌍 Generations:** {load_counter()}",
         )

-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-        "
-
-
-
-
-
+        # Use parallel synthesis
+        vertex_client = get_vertex_client()
+        counter_incremented = False
+
+        for (
+            model_type,
+            success,
+            audio_bytes,
+            metrics,
+        ) in vertex_client.synthesize_parallel(text, voice_id):
+            if success and audio_bytes:
+                # Save audio file in system temp directory
+                temp_dir = tempfile.gettempdir()
+                audio_file = os.path.join(
+                    temp_dir, f"ringg_{model_type}_{str(uuid.uuid4())}.wav"
+                )
+                with open(audio_file, "wb") as f:
+                    f.write(audio_bytes)
+
+                # Increment counter only once (for the first successful result)
+                if not counter_incremented:
+                    new_count = increment_counter()
+                    counter_incremented = True
+                else:
+                    new_count = load_counter()
+
+                # Format metrics
+                metrics_json = ""
+                has_metrics = False
+                if metrics:
+                    has_metrics = True
+                    metrics_json = json.dumps(
+                        {
+                            "total_time": f"{metrics.get('t', 0):.3f}s",
+                            "rtf": f"{metrics.get('rtf', 0):.4f}",
+                            "audio_duration": f"{metrics.get('wav_seconds', 0):.2f}s",
+                            "vocoder_time": f"{metrics.get('t_vocoder', 0):.3f}s",
+                            "no_vocoder_time": f"{metrics.get('t_no_vocoder', 0):.3f}s",
+                            "rtf_no_vocoder": f"{metrics.get('rtf_no_vocoder', 0):.4f}",
+                        },
+                        indent=2,
+                    )
+
+                # Update the corresponding model result
+                results[model_type] = {
+                    "audio": audio_file,
+                    "status": "",
+                    "metrics": metrics_json,
+                    "has_metrics": has_metrics,
+                }
+            else:
+                # Update failed model
+                results[model_type] = {
+                    "audio": None,
+                    "status": "❌ Failed to generate",
+                    "metrics": "",
+                    "has_metrics": False,
+                }
+
+            # Yield updated state for both models
+            yield (
+                results["base"]["audio"],
+                results["base"]["status"],
+                gr.update(visible=results["base"].get("has_metrics", False)),
+                gr.update(
+                    value=results["base"]["metrics"],
+                    visible=results["base"].get("has_metrics", False),
+                ),
+                results["distill"]["audio"],
+                results["distill"]["status"],
+                gr.update(visible=results["distill"].get("has_metrics", False)),
+                gr.update(
+                    value=results["distill"]["metrics"],
+                    visible=results["distill"].get("has_metrics", False),
+                ),
+                f"**🌍 Generations:** {new_count if counter_incremented else load_counter()}",
             )

-        return (
-            audio_file,
-            gr.update(visible=has_metrics),
-            gr.update(value=metrics_json, visible=has_metrics),
-            f"**🌍 Generations:** {new_count}",
-        )
-
     def refresh_counter_on_load():
         """Refresh the universal generation counter when the UI loads/reloads"""
         return f"**🌍 Generations since last reload:** {load_counter()}"

@@ -356,10 +475,14 @@ with gr.Blocks(
         fn=on_generate,
         inputs=[text_input, voice_dropdown],
         outputs=[
-
-
-
-
+            audio_output_base,
+            status_base,
+            metrics_header_base,
+            metrics_output_base,
+            audio_output_distill,
+            status_distill,
+            metrics_header_distill,
+            metrics_output_distill,
             generation_counter,
         ],
         concurrency_limit=2,
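The rewritten on_generate works because Gradio treats a handler written as a generator as a streaming event: every yield pushes a fresh tuple onto the listed outputs, which is what lets the base and distill panels update independently as each model finishes. A minimal sketch of that pattern in isolation (the components and the two-step fake task below are illustrative, not from this commit):

import time
import gradio as gr

def run_two_stage():
    """Generator handler: each yield re-renders the outputs."""
    yield "⏳ Loading...", "⏳ Loading..."  # initial state, like on_generate's first yield
    time.sleep(1)                           # stand-in for the faster (distill) endpoint
    yield "⏳ Loading...", "distill done"
    time.sleep(1)                           # stand-in for the slower (base) endpoint
    yield "base done", "distill done"

with gr.Blocks() as demo:
    base_box = gr.Textbox(label="Base")
    distill_box = gr.Textbox(label="Distill")
    # Yielded tuples map positionally onto the outputs list, exactly as
    # on_generate's nine-element tuples map onto its nine outputs.
    gr.Button("Run").click(fn=run_two_stage, outputs=[base_box, distill_box])

demo.launch()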
generation_counter.json CHANGED

@@ -1 +1 @@
-{"count":
+{"count": 10, "last_updated": 1762780862.430711}
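app.py imports fcntl, and this file carries a count plus a last_updated timestamp, which suggests load_counter/increment_counter guard the JSON with an advisory file lock so concurrent Gradio workers don't lose updates. Those helpers are not shown in the diff; a plausible sketch under that assumption:

import fcntl
import json
import time

COUNTER_FILE = "generation_counter.json"  # path assumed from the repo layout

def increment_counter() -> int:
    """Read-modify-write the shared counter under an exclusive lock."""
    with open(COUNTER_FILE, "r+") as f:
        fcntl.flock(f, fcntl.LOCK_EX)  # blocks until no other process holds the lock
        try:
            data = json.load(f)
            data["count"] = data.get("count", 0) + 1
            data["last_updated"] = time.time()
            f.seek(0)
            json.dump(data, f)
            f.truncate()  # drop leftover bytes if the old value was longer
            return data["count"]
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)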
vertex_client.py CHANGED

@@ -5,7 +5,8 @@ import os
 import json
 import logging
 import requests
-from typing import Optional, Dict, Any, Tuple
+from typing import Optional, Dict, Any, Tuple, Generator
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from google.cloud import aiplatform
 from google.oauth2 import service_account
 from dotenv import load_dotenv

@@ -24,6 +25,7 @@ class VertexAIClient:
     def __init__(self):
         """Initialize the Vertex AI client."""
         self.endpoint = None
+        self.endpoint_distill = None
         self.credentials = None
         self.initialized = False

@@ -57,7 +59,7 @@ class VertexAIClient:

     def initialize(self) -> bool:
         """
-        Initialize Vertex AI and find the zipvoice
+        Initialize Vertex AI and find the zipvoice and zipvoice_base_distill endpoints.

         Returns:
             True if initialization successful, False otherwise

@@ -80,16 +82,26 @@ class VertexAIClient:
             )
             logger.info("Vertex AI initialized for project desivocalprod01")

-            # Find
+            # Find both endpoints
             for endpoint in aiplatform.Endpoint.list():
                 if endpoint.display_name == "zipvoice":
                     self.endpoint = endpoint
-                    self.initialized = True
                     logger.info(f"Found zipvoice endpoint: {endpoint.resource_name}")
-
+                elif endpoint.display_name == "zipvoice_base_distill":
+                    self.endpoint_distill = endpoint
+                    logger.info(f"Found zipvoice_base_distill endpoint: {endpoint.resource_name}")

-
-
+            # Check if at least the base endpoint is found
+            if not self.endpoint:
+                logger.error("zipvoice endpoint not found in Vertex AI")
+                return False
+
+            # Warn if distill endpoint is not found but continue
+            if not self.endpoint_distill:
+                logger.warning("zipvoice_base_distill endpoint not found - distill model will not be available")
+
+            self.initialized = True
+            return True

         except Exception as e:
             logger.error(f"Failed to initialize Vertex AI: {e}")

@@ -185,6 +197,112 @@ class VertexAIClient:
             logger.error(f"Failed to synthesize speech with Vertex AI: {e}")
             return False, None, None

+    def synthesize_distill(self, text: str, voice_id: str, timeout: int = 60) -> Tuple[bool, Optional[bytes], Optional[Dict[str, Any]]]:
+        """
+        Synthesize speech from text using Vertex AI distill endpoint.
+
+        Args:
+            text: Text to synthesize
+            voice_id: Voice ID to use
+            timeout: Request timeout in seconds
+
+        Returns:
+            Tuple of (success, audio_bytes, metrics)
+        """
+        if not self.initialized:
+            if not self.initialize():
+                return False, None, None
+
+        if not self.endpoint_distill:
+            logger.error("Distill endpoint not available")
+            return False, None, None
+
+        try:
+            logger.info(f"Synthesizing text (length: {len(text)}) with voice {voice_id} using distill model")
+            response = self.endpoint_distill.raw_predict(
+                body=json.dumps({
+                    "text": text,
+                    "voice_id": voice_id,
+                    "model_type": "distill",
+                }),
+                headers={"Content-Type": "application/json"},
+            )
+
+            # Parse JSON response
+            result = json.loads(response.text) if hasattr(response, 'text') else response
+            logger.info(f"Vertex AI distill response: {result}")
+
+            # Check if synthesis was successful
+            if result.get("success"):
+                audio_url = result.get("audio_url")
+                metrics = result.get("metrics")
+
+                if not audio_url:
+                    logger.error("No audio_url in successful response")
+                    return False, None, None
+
+                # Download audio from URL
+                logger.info(f"Downloading audio from: {audio_url}")
+                audio_response = requests.get(audio_url, timeout=timeout)
+
+                if audio_response.status_code == 200:
+                    audio_data = audio_response.content
+                    logger.info(f"Successfully downloaded audio ({len(audio_data)} bytes)")
+                    return True, audio_data, metrics
+                else:
+                    logger.error(f"Failed to download audio: HTTP {audio_response.status_code}")
+                    return False, None, None
+            else:
+                error_msg = result.get("message", "Unknown error")
+                logger.error(f"Synthesis failed: {error_msg}")
+                return False, None, None
+
+        except Exception as e:
+            logger.error(f"Failed to synthesize speech with Vertex AI distill: {e}")
+            return False, None, None
+
+    def synthesize_parallel(self, text: str, voice_id: str, timeout: int = 60) -> Generator[Tuple[str, bool, Optional[bytes], Optional[Dict[str, Any]]], None, None]:
+        """
+        Synthesize speech from text using both base and distill endpoints in parallel.
+
+        Yields results as they arrive (doesn't wait for both to complete).
+
+        Args:
+            text: Text to synthesize
+            voice_id: Voice ID to use
+            timeout: Request timeout in seconds
+
+        Yields:
+            Tuple of (model_type, success, audio_bytes, metrics)
+            model_type is either "base" or "distill"
+        """
+        if not self.initialized:
+            if not self.initialize():
+                logger.error("Failed to initialize client for parallel synthesis")
+                return
+
+        # Create executor for parallel execution
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            # Submit both tasks
+            futures = {}
+
+            # Always submit base model
+            futures[executor.submit(self.synthesize, text, voice_id, timeout)] = "base"
+
+            # Submit distill model if available
+            if self.endpoint_distill:
+                futures[executor.submit(self.synthesize_distill, text, voice_id, timeout)] = "distill"
+
+            # Yield results as they complete
+            for future in as_completed(futures):
+                model_type = futures[future]
+                try:
+                    success, audio_bytes, metrics = future.result()
+                    yield model_type, success, audio_bytes, metrics
+                except Exception as e:
+                    logger.error(f"Error in parallel synthesis for {model_type}: {e}")
+                    yield model_type, False, None, None
+

 # Global instance
 _vertex_client = None
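synthesize_parallel gets its streaming behaviour from concurrent.futures.as_completed, which yields futures in completion order rather than submission order, so whichever endpoint answers first reaches the UI first. The mechanics in isolation (the two latencies below are stand-ins for the real endpoints' response times):

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_endpoint(name: str, latency: float) -> str:
    time.sleep(latency)  # stand-in for a raw_predict round trip
    return f"audio-from-{name}"

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {
        executor.submit(fake_endpoint, "base", 2.0): "base",
        executor.submit(fake_endpoint, "distill", 0.5): "distill",
    }
    for future in as_completed(futures):    # completion order, not submission order
        model_type = futures[future]
        print(model_type, future.result())  # prints the distill result first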