# Hugging Face Spaces demo app (Space status at capture time: Sleeping)
| import gradio as gr | |
| import numpy as np | |
| import os | |
def synthesize_speech(text, speaker_id=0):
    """Generate placeholder audio for the given text.

    Stand-in for real model inference: synthesizes a short decaying sine
    tone (fundamental + second harmonic + light noise) whose length scales
    with the text length and whose pitch varies with the speaker id.
    Replace this with actual model inference once trained models exist.

    Args:
        text: Text to "synthesize". ``None`` or whitespace-only input
            yields no audio.
        speaker_id: Integer speaker index; shifts the base frequency so
            different speakers sound distinct.

    Returns:
        A ``(sample_rate, samples)`` tuple of ``(int, np.float32 array)``
        suitable for ``gr.Audio(type="numpy")``, or ``None`` when there is
        nothing to synthesize.
    """
    # Guard against None as well as empty/whitespace text (Gradio can pass
    # either depending on how the textbox is cleared).
    if not text or not text.strip():
        return None

    sample_rate = 24000
    # Rough duration estimate: ~80 ms per character, at least 1 second.
    duration = max(1.0, len(text) * 0.08)
    samples = int(sample_rate * duration)

    t = np.linspace(0, duration, samples)
    frequency = 440 + (speaker_id * 50)  # vary pitch per speaker

    # Fundamental + second harmonic, each with exponential decay, plus a
    # little Gaussian noise so the placeholder sounds less sterile.
    audio = (
        0.3 * np.sin(2 * np.pi * frequency * t) * np.exp(-t / (duration * 0.8)) +
        0.1 * np.sin(2 * np.pi * frequency * 2 * t) * np.exp(-t / duration) +
        0.05 * np.random.randn(samples)
    )

    # Linear fade in/out. Cap the fade at half the signal so the two ramps
    # never overlap and a zero-length fade never produces an empty-slice
    # broadcast error (audio[-0:] would be the whole array).
    fade_samples = min(int(0.1 * sample_rate), samples // 2)
    if fade_samples > 0:
        audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    # The noise term can push peaks outside [-1, 1]; clip to the valid
    # float-audio range so playback doesn't wrap or distort.
    audio = np.clip(audio, -1.0, 1.0)
    return (sample_rate, audio.astype(np.float32))
def create_demo():
    """Build and return the Gradio Blocks UI for the Learnable-Speech demo.

    Layout: informational header, a two-column row (text input + speaker
    slider + generate button on the left, audio player on the right), a
    training-status accordion, feature/architecture notes, and clickable
    examples. The generate button and examples both route through
    ``synthesize_speech``, which currently emits placeholder audio.

    Returns:
        gr.Blocks: the assembled (un-launched) demo.
    """
    with gr.Blocks(
        title="Learnable-Speech Demo",
        theme=gr.themes.Default(),
        # Widen the default container so the two-column layout has room.
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        """
    ) as demo:
        # Header: project overview plus instructions for training real models.
        gr.Markdown(
            """
            # 🎤 Learnable-Speech: High-Quality 24kHz Speech Synthesis
            An unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.
            > **⚠️ This is a demo interface with placeholder audio. To use the actual model, you need to train it first!**
            ## 🚀 How to Train Your Own Model:
            1. **Follow the [Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)**
            2. **Use the provided training scripts** in the `scripts/` directory
            3. **Upload your trained models** to Hugging Face Hub
            4. **Replace the placeholder code** in this Space with your models
            ### Quick Start:
            ```bash
            # 1. Prepare your dataset
            ./scripts/prepare_data.sh
            # 2. Train the model
            ./scripts/train_full_pipeline.sh
            # 3. Upload to Hugging Face
            python scripts/upload_to_hf.py --username your_username
            ```
            """
        )
        # Main interaction row: inputs on the left, audio output on the right.
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text here...",
                    lines=3,
                    value="Hello, this is a demo of Learnable-Speech synthesis."
                )
                with gr.Row():
                    # Speaker id feeds synthesize_speech, which varies pitch by id.
                    speaker_slider = gr.Slider(
                        minimum=0,
                        maximum=10,
                        value=0,
                        step=1,
                        label="Speaker ID"
                    )
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
            with gr.Column():
                # type="numpy" matches the (sample_rate, ndarray) tuple returned
                # by synthesize_speech.
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy"
                )
        # Status panel; open by default to make the placeholder nature obvious.
        with gr.Accordion("🎯 Training Status & Next Steps", open=True):
            gr.Markdown(
                """
                ### 📋 Current Status:
                - ✅ **Demo Interface**: Ready
                - ❌ **Trained Models**: Not available (placeholder audio only)
                - ❌ **Model Inference**: Not implemented yet
                ### 🔧 To Enable Real Speech Synthesis:
                1. **Train the models** using the provided pipeline
                2. **Upload trained checkpoints** to Hugging Face Hub
                3. **Update the inference code** in `synthesize_speech()` function
                4. **Test with real model outputs**
                ### 📚 Resources:
                - [📖 Complete Training Guide](https://github.com/primepake/learnable-speech/blob/main/TRAINING_GUIDE.md)
                - [🛠️ Training Scripts](https://github.com/primepake/learnable-speech/tree/main/scripts)
                - [📄 Research Paper](https://arxiv.org/pdf/2505.07916)
                - [💻 GitHub Repository](https://github.com/primepake/learnable-speech)
                """
            )
        # Static project documentation: features, architecture, pipeline, links.
        gr.Markdown(
            """
            ### Key Features
            - **24kHz Audio Support**: High-quality audio generation at 24kHz sampling rate
            - **Flow matching AE**: Flow matching training for autoencoders
            - **Immiscible assignment**: Support immiscible adding noise while training
            - **Contrastive Flow matching**: Support Contrastive training
            ### Architecture
            **Stage 1**: Audio to Discrete Tokens - Converts raw audio into discrete representations using FSQ (S3Tokenizer)
            **Stage 2**: Discrete Tokens to Continuous Latent Space - Maps discrete tokens to continuous latent space using VAE
            ### Training Pipeline
            1. Extract discrete tokens using trained FSQ S3Tokenizer
            2. Generate continuous latent representations using trained DAC-VAE
            3. Train Stage 1: BPE tokens → Discrete FSQ
            4. Train Stage 2: Discrete FSQ → DAC-VAE Continuous latent space
            ### Links
            - [GitHub Repository](https://github.com/primepake/learnable-speech)
            - [Technical Paper](https://arxiv.org/pdf/2505.07916)
            """
        )
        # Clickable example texts; all use speaker 0. Caching is off because
        # the placeholder output is noisy/non-deterministic.
        with gr.Row():
            gr.Examples(
                examples=[
                    ["Hello everyone! I am here to tell you that Learnable-Speech is amazing!"],
                    ["The Secret Service believed that it was very doubtful that any President would ride regularly in a vehicle."],
                    ["We propose Learnable-Speech, a new approach to neural text-to-speech synthesis."],
                    ["This implementation uses flow matching for high-quality 24kHz audio generation."],
                ],
                inputs=[text_input],
                fn=lambda x: synthesize_speech(x, 0),
                outputs=audio_output,
                cache_examples=False,
                label="Example Texts"
            )
        # Wire the button: (text, speaker id) -> audio tuple.
        generate_btn.click(
            fn=synthesize_speech,
            inputs=[text_input, speaker_slider],
            outputs=audio_output
        )
    return demo
| if __name__ == "__main__": | |
| # Get environment variables for flexible deployment | |
| port = int(os.environ.get("PORT", 7860)) | |
| host = os.environ.get("HOST", "0.0.0.0") | |
| demo = create_demo() | |
| # Try to launch with error handling | |
| try: | |
| demo.launch( | |
| server_name=host, | |
| server_port=port, | |
| share=False, | |
| show_error=True, | |
| quiet=False, | |
| enable_queue=True | |
| ) | |
| except Exception as e: | |
| print(f"Failed to launch on {host}:{port}, trying with share=True") | |
| demo.launch( | |
| share=True, | |
| show_error=True, | |
| quiet=False, | |
| enable_queue=True | |
| ) | |