| """ | |
| Global Video Localizer | |
| Automated video localization using AI-powered transcription, translation, and voice synthesis. | |
| """ | |
| import gradio as gr | |
| from localizer_engine import ( | |
| process_video, | |
| validate_elevenlabs_api_key, | |
| ) | |
def apply_gradio_patch():
    """Apply workaround for Gradio's JSON schema parsing bug."""
    import gradio_client.utils as gradio_utils

    original_get_type = gradio_utils.get_type
    original_json_schema_to_python_type = gradio_utils._json_schema_to_python_type

    def patched_get_type(schema):
        if not isinstance(schema, dict):
            return "any"
        try:
            return original_get_type(schema)
        except TypeError:
            return "any"

    def patched_json_schema_to_python_type(schema, defs):
        if not isinstance(schema, dict):
            return "Any"
        try:
            return original_json_schema_to_python_type(schema, defs)
        except TypeError:
            return "Any"

    gradio_utils.get_type = patched_get_type
    gradio_utils._json_schema_to_python_type = patched_json_schema_to_python_type

    import gradio_client.utils
    gradio_client.utils.get_type = patched_get_type
    gradio_client.utils._json_schema_to_python_type = patched_json_schema_to_python_type


apply_gradio_patch()

def localize_video(video_path, target_language, api_key=None, progress=gr.Progress(track_tqdm=True)):
    """Process video localization request (keys stay per-session and are not persisted)."""
    if not video_path:
        return None, "Please upload a video to get started.", ""

    key = api_key.strip() if api_key and api_key.strip() else None

    progress(0, desc="Queued...")
    try:
        output_path, original_text, translated_text = process_video(
            video_path,
            target_language,
            elevenlabs_api_key=key,
            progress_callback=progress,
        )
        return output_path, original_text, translated_text
    except Exception as e:
        error_message = f"Processing failed: {str(e)}"
        return None, error_message, ""
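

# Illustrative sketch only: localizer_engine is not shown in this file, so the exact
# progress contract is an assumption. A stage inside process_video might drive the
# gr.Progress callback passed above roughly like this (stage names and fractions are
# hypothetical, not the engine's real values).
def _example_progress_stages(progress_callback=None):
    stages = [
        (0.10, "Extracting audio..."),
        (0.35, "Transcribing with Whisper..."),
        (0.55, "Translating..."),
        (0.80, "Synthesizing voice..."),
        (0.95, "Merging audio and video..."),
    ]
    for fraction, description in stages:
        if progress_callback is not None:
            progress_callback(fraction, desc=description)
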
# Design System
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');

:root {
    --peach: #ffad7a;
    --peach-dark: #e8935c;
    --lavender: #b8a9d9;
    --sky-blue: #7ACCFF;
    --bg-light: #f9fafb;
    --surface: #ffffff;
    --text-primary: #1f2937;
    --text-secondary: #4b5563;
    --text-muted: #6b7280;
    --border-default: #e5e7eb;
    --border-subtle: #f3f4f6;
    --accent: #ffad7a;
    --accent-hover: #e8935c;
    --accent-subtle: rgba(255, 173, 122, 0.1);
    --shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.05);
    --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.08);
    --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.12);
}

body {
    background: var(--bg-light) !important;
    color: var(--text-primary) !important;
    font-family: 'Inter', 'Helvetica Neue', 'Segoe UI', system-ui, -apple-system, sans-serif !important;
    -webkit-font-smoothing: antialiased;
    font-weight: 400;
    letter-spacing: -0.01em;
}

.gradio-container {
    max-width: 100% !important;
    background: var(--bg-light) !important;
    font-family: 'Inter', 'Helvetica Neue', 'Segoe UI', system-ui, -apple-system, sans-serif !important;
}

.main-header {
    text-align: center;
    padding: 2.5rem 2rem;
    background: linear-gradient(135deg, var(--peach) 0%, var(--lavender) 50%, var(--sky-blue) 100%);
    border-radius: 20px;
    margin: 1rem;
    box-shadow: var(--shadow-lg), 0 0 30px rgba(255, 173, 122, 0.2);
    position: relative;
    overflow: hidden;
}

.main-header::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background: radial-gradient(ellipse at 30% 20%, rgba(255,255,255,0.35) 0%, transparent 50%);
    pointer-events: none;
}

.main-header h1 {
    font-size: 2.75rem;
    font-weight: 600;
    color: #ffffff;
    margin-bottom: 0.5rem;
    text-shadow: 0 2px 8px rgba(0,0,0,0.15);
    letter-spacing: -0.03em;
    position: relative;
    font-family: 'Inter', 'Helvetica Neue', system-ui, sans-serif;
}

.main-header h3 {
    color: rgba(255, 255, 255, 0.95);
    font-size: 1.1rem;
    font-weight: 450;
    position: relative;
}

.main-header p {
    color: rgba(255, 255, 255, 0.95);
    font-size: 1rem;
    font-weight: 400;
    position: relative;
}

input, select, textarea {
    background: var(--bg-light) !important;
    border: 1px solid var(--border-default) !important;
    color: var(--text-primary) !important;
    border-radius: 8px !important;
    transition: all 0.15s ease !important;
    font-family: 'Inter', sans-serif !important;
}

input:focus, select:focus, textarea:focus {
    border-color: var(--accent) !important;
    box-shadow: 0 0 0 3px var(--accent-subtle) !important;
    outline: none !important;
}

button.primary, button[class*="primary"] {
    background: linear-gradient(135deg, var(--accent) 0%, var(--accent-hover) 100%) !important;
    color: #ffffff !important;
    font-weight: 600 !important;
    border: none !important;
    border-radius: 10px !important;
    padding: 0.75rem 1.5rem !important;
    transition: all 0.2s ease !important;
    box-shadow: 0 2px 8px rgba(255, 173, 122, 0.3) !important;
    font-family: 'Inter', sans-serif !important;
}

button.primary:hover, button[class*="primary"]:hover {
    background: linear-gradient(135deg, var(--accent-hover) 0%, #d67d45 100%) !important;
    transform: translateY(-1px) !important;
    box-shadow: 0 4px 16px rgba(255, 173, 122, 0.4) !important;
}

label {
    color: var(--text-secondary) !important;
    font-weight: 500 !important;
    font-size: 0.875rem !important;
    font-family: 'Inter', sans-serif !important;
}

.markdown-text h3, h3 {
    color: var(--text-primary) !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
    margin-bottom: 0.5rem !important;
    font-family: 'Inter', sans-serif !important;
}

.markdown-text, .markdown-text p, .markdown-text span {
    color: var(--text-primary) !important;
    font-family: 'Inter', sans-serif !important;
}

.markdown-text strong {
    color: var(--text-primary) !important;
    font-weight: 600 !important;
}

.gr-video, .gr-image {
    border-radius: 12px !important;
    border: 1px solid var(--border-default) !important;
    box-shadow: var(--shadow-md) !important;
    background: var(--surface) !important;
}

.gr-video:hover, .gr-image:hover {
    border-color: var(--accent) !important;
    box-shadow: 0 4px 16px rgba(255, 173, 122, 0.2) !important;
}

.gr-textbox {
    background: var(--bg-light) !important;
    border: 1px solid var(--border-default) !important;
    border-radius: 8px !important;
    color: var(--text-primary) !important;
    font-family: 'Inter', sans-serif !important;
}

.gr-textbox:focus {
    border-color: var(--accent) !important;
    box-shadow: 0 0 0 3px var(--accent-subtle) !important;
}

.gr-dropdown {
    background: var(--bg-light) !important;
    border: 1px solid var(--border-default) !important;
    border-radius: 8px !important;
    color: var(--text-primary) !important;
    font-family: 'Inter', sans-serif !important;
}

.gr-accordion {
    background: var(--surface) !important;
    border: 1px solid var(--border-default) !important;
    border-radius: 8px !important;
    box-shadow: var(--shadow-sm) !important;
}

blockquote, .markdown-text blockquote {
    border-left: 3px solid var(--lavender) !important;
    background: #faf9fc !important;
    padding: 0.75rem 1rem !important;
    margin: 0.5rem 0 !important;
    border-radius: 0 6px 6px 0 !important;
    color: var(--text-secondary) !important;
}

a {
    color: #2563eb !important;
    text-decoration: none !important;
}

a:hover {
    color: var(--accent-hover) !important;
    text-decoration: underline !important;
}

input[type="range"] {
    accent-color: var(--accent) !important;
}

.generating {
    position: relative;
    overflow: hidden;
}

.generating::after {
    content: '';
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255,173,122,0.2), transparent);
    animation: loading 1.5s infinite;
}

@keyframes loading {
    0% { left: -100%; }
    100% { left: 100%; }
}

.progress-bar {
    height: 4px;
    background: linear-gradient(90deg, var(--accent), var(--lavender));
    border-radius: 2px;
    animation: progress 2s ease-in-out infinite;
}

@keyframes progress {
    0%, 100% { transform: scaleX(0.3); transform-origin: left; }
    50% { transform: scaleX(1); transform-origin: left; }
}

.gr-column {
    background: var(--surface) !important;
    border-radius: 12px !important;
    padding: 1.5rem !important;
    border: 1px solid var(--border-default) !important;
    box-shadow: var(--shadow-md) !important;
}

@media (max-width: 1024px) {
    .main-header h1 {
        font-size: 2.25rem;
    }
    .gr-column {
        margin-bottom: 1rem;
    }
}

@media (max-width: 768px) {
    .main-header h1 {
        font-size: 1.75rem;
    }
    .main-header h3 {
        font-size: 0.95rem;
    }
    .main-header {
        padding: 1.5rem 1rem;
        margin: 0.5rem;
        border-radius: 12px;
    }
    .gr-column {
        padding: 1rem !important;
        border-radius: 8px !important;
    }
    button.primary, button[class*="primary"] {
        padding: 0.625rem 1.25rem !important;
        font-size: 0.9rem !important;
    }
}

@media (max-width: 480px) {
    .main-header h1 {
        font-size: 1.5rem;
    }
    .main-header h3 {
        font-size: 0.85rem;
    }
    .main-header p {
        font-size: 0.8rem;
    }
    .main-header {
        padding: 1rem 0.75rem;
    }
    .gr-column {
        padding: 0.75rem !important;
    }
}
"""

def create_interface():
    """Build the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft(), css=CSS, title="Global Video Localizer") as app:
        gr.HTML(
            """
            <div class="main-header">
                <h1>🌍 Global Video Localizer</h1>
                <h3>Break language barriers. Reach global audiences. One video, infinite possibilities.</h3>
                <p>Works completely free with open source models. Add your ElevenLabs key for premium voice quality.</p>
            </div>
            """
        )
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📹 Upload Your Video")
                video_input = gr.Video(
                    label="Source Video",
                    sources=["upload"],
                )
                lang_dropdown = gr.Dropdown(
                    choices=[
                        ("Spanish 🇪🇸", "es"),
                        ("French 🇫🇷", "fr"),
                        ("German 🇩🇪", "de"),
                        ("Italian 🇮🇹", "it"),
                        ("Japanese 🇯🇵", "ja"),
                        ("Chinese 🇨🇳", "zh"),
                        ("Hindi 🇮🇳", "hi"),
                        ("Arabic 🇸🇦", "ar"),
                    ],
                    value="es",
                    label="Target Language",
                    info="Select the language for your localized video",
                )
                api_key_input = gr.Textbox(
                    label="ElevenLabs API Key (Optional)",
                    type="password",
                    placeholder="sk_...",
                    info="Works perfectly without it using open source models. Add your key for premium voice quality.",
                    visible=True,
                )
                api_key_status = gr.Markdown("ℹ️ Using open source models (EdgeTTS)", visible=True)
                localize_btn = gr.Button(
                    "🚀 Localize Video",
                    variant="primary",
                    size="lg",
                )
| with gr.Accordion("💡 How It Works", open=False): | |
| gr.Markdown(""" | |
| ### The Problem | |
| Content creators, educators, and businesses face a massive challenge: reaching global audiences. Traditional video dubbing costs thousands of dollars per video and takes weeks. Most content never gets localized because it's simply too expensive and time-consuming. | |
| ### The Solution | |
| Global Video Localizer automates the entire process. Upload a video, select a language, and get a professionally dubbed version in minutes. No studios. No voice actors. No waiting. | |
| **It works completely free** using open source AI models. You can use it right now without any API keys. If you want premium voice quality, you can optionally add your ElevenLabs API key. | |
| ### Why It's Smart | |
| This is the first fully automated video localization system that works end-to-end with zero manual intervention. It combines state-of-the-art AI models in a seamless pipeline: your video becomes audio, audio becomes text, text gets translated, translation becomes voice, and voice syncs perfectly with your original video. | |
| **The MCP Advantage**: Model Context Protocol (MCP) extends AI capabilities beyond simple chat interfaces. Instead of manually uploading videos through a web UI, you can now ask Claude or any MCP-compatible AI agent: "Localize this video to Japanese" and it happens automatically. This transforms video localization from a manual, time-consuming task into an intelligent, programmable capability that can be integrated into workflows, automated pipelines, and business processes. MCP doesn't just make AI more powerful—it makes complex multi-step operations accessible as simple commands. | |
| The intelligent fallback system ensures it always works. If one service is unavailable, it automatically uses the next best option. You never get stuck with a silent video. | |
| ### The Process | |
| 1. **Extract & Transcribe**: AI listens to your video and understands every word using local Whisper models | |
| 2. **Translate**: Context-aware translation preserves meaning and nuance across languages | |
| 3. **Generate Voice**: High-quality AI voices match the tone, emotion, and pacing of the original | |
| 4. **Sync & Merge**: Advanced time-stretching ensures perfect timing—the new audio matches your video frame-by-frame | |
| All of this happens automatically. You just upload and wait a few minutes. Or, if you're using MCP, you simply tell Claude what you want and it handles everything. | |
| """) | |
| with gr.Accordion("⚙️ Technical Capabilities", open=False): | |
| gr.Markdown(""" | |
| ### MCP: Extending AI Capabilities to Solve Business Challenges | |
| **The Business Problem**: Traditional video localization requires expensive studios, voice actors, and weeks of coordination. For businesses creating content at scale, this is a massive bottleneck. Content creators can't afford to localize every video. Educational institutions struggle to reach global students. Enterprises need faster, cheaper ways to expand internationally. | |
| **How MCP Solves This**: Model Context Protocol transforms video localization from a manual, expensive process into an intelligent, programmable capability. Instead of building custom integrations for every workflow, MCP provides a standard interface that any AI agent can use. This means: | |
| - **Automation at Scale**: Integrate video localization into content pipelines, marketing workflows, and educational platforms | |
| - **Natural Language Interface**: Ask Claude "Localize all videos in this folder to Spanish" and it happens automatically | |
| - **Extensible Architecture**: Other developers can build on this MCP server, creating specialized tools for specific industries | |
| - **Cost Reduction**: What used to cost thousands and take weeks now costs nothing and takes minutes | |
| **MCP Server Implementation**: Full Model Context Protocol server exposes video localization as a tool that Claude and other AI agents can call programmatically. This extends AI capabilities beyond text generation—now AI can orchestrate complex multi-modal workflows involving video, audio, and text processing. | |
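
As a rough illustration of programmatic access, the snippet below uses the MCP Python SDK to call a hypothetical `localize_video` tool over stdio. The server command, tool name, and argument names are assumptions, not the exact schema exposed by this Space:

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def main():
    # Hypothetical server command and tool schema -- adjust to the real MCP server.
    server = StdioServerParameters(command="python", args=["mcp_server.py"])
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "localize_video",
                arguments={"video_path": "demo.mp4", "target_language": "ja"},
            )
            print(result)

asyncio.run(main())
```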

### Architecture

**Multi-Modal Pipeline**: Seamlessly processes video → audio → text → translation → voice → video in a single automated workflow. Each step is optimized for quality and reliability.
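
A condensed sketch of that pipeline shape (the function below is illustrative; the step callables are placeholders, not the actual localizer_engine API):

```python
def localize(video_path, target_language, extract_audio, transcribe, translate,
             synthesize, merge_audio):
    # Illustrative wiring only: each stage is injected as a callable.
    audio_path = extract_audio(video_path)                    # video -> audio
    transcript = transcribe(audio_path)                       # audio -> text
    translated = translate(transcript, target_language)       # text -> text
    dubbed_audio = synthesize(translated, target_language)    # text -> voice
    return merge_audio(video_path, dubbed_audio)              # voice -> video
```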

**Intelligent Fallback System**:

- Primary: ElevenLabs (premium quality, optional)
- Fallback 1: EdgeTTS (high quality, free, open source)
- Fallback 2: Coqui TTS (local neural TTS)
- Fallback 3: gTTS (reliable backup)
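
A minimal sketch of how such a chain can be driven (the helper below is illustrative and takes the backend callables as input; the real localizer_engine logic may differ):

```python
def synthesize_with_fallback(text, language, backends):
    # Try each (name, synth_fn) backend in order and return the first success.
    last_error = None
    for name, synth_fn in backends:
        try:
            return name, synth_fn(text, language)
        except Exception as error:  # this backend failed, move on to the next
            last_error = error
    raise RuntimeError(f"All TTS backends failed: {last_error}")
```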

**Why ElevenLabs Was Chosen**: After extensive testing of multiple TTS providers, ElevenLabs consistently delivered superior results across all metrics:

- **Naturalness**: ElevenLabs voices sound human, not robotic. In side-by-side comparisons, listeners consistently rated ElevenLabs output as more natural than EdgeTTS, Coqui, and gTTS
- **Emotional Range**: ElevenLabs captures subtle emotional nuances—excitement, concern, authority—that other models flatten. For example, when dubbing an educational video, ElevenLabs maintained the instructor's warm, encouraging tone, while EdgeTTS sounded monotone
- **Language Accuracy**: For non-Latin scripts (Japanese, Arabic, Chinese), ElevenLabs produces native-sounding pronunciation. EdgeTTS often mispronounced technical terms, and gTTS struggled with proper nouns
- **Consistency**: ElevenLabs maintains consistent voice characteristics across long-form content. Other models showed noticeable variations in tone and pacing
- **Production Quality**: The output quality is studio-grade, suitable for professional content. EdgeTTS and Coqui produce good results, but ElevenLabs crosses the threshold into "indistinguishable from human" territory

However, the app works perfectly without ElevenLabs using open source models. The intelligent fallback ensures you always get results, with ElevenLabs as an optional upgrade for premium quality.

**Audio Processing**: Advanced time-stretching and synchronization ensure perfect lip-sync and timing. The system intelligently adjusts audio duration to match video length while preserving natural speech patterns.
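
For instance, duration matching can be approximated with ffmpeg's `atempo` filter. The sketch below is illustrative only: it assumes ffmpeg is on PATH, and it ignores that `atempo` accepts only a limited factor range per filter, so a production version may need to chain filters:

```python
import subprocess

def stretch_audio(audio_in, audio_out, current_seconds, target_seconds):
    # Speed audio up or down so it lasts roughly target_seconds (sketch only).
    tempo = current_seconds / target_seconds  # >1.0 plays faster, <1.0 plays slower
    subprocess.run(
        ["ffmpeg", "-y", "-i", audio_in, "-filter:a", f"atempo={tempo:.4f}", audio_out],
        check=True,
    )
```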

**Privacy-First**: The Whisper model runs locally as part of the app, keeping your content private. No audio is sent to external services for transcription.
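
Local transcription with the open source `whisper` package looks roughly like this (the model size and file name are illustrative, not the exact settings used here):

```python
import whisper

model = whisper.load_model("base")                # weights download once, then run locally
result = model.transcribe("extracted_audio.wav")  # returns text, segments, detected language
print(result["language"], result["text"])
```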

**Language Support**: 8 languages with native-quality voices for each, covering major global markets.

**Open Source Foundation**: Built on open source models, the app works completely free without any API keys. Premium options are available but never required.
""")
            with gr.Column(scale=1):
                gr.Markdown("### 🎬 Localized Output")
                video_output = gr.Video(
                    label="Your Localized Video",
                    height=400,
                )
                with gr.Accordion("📝 Transcript Analysis", open=True):
                    orig_text = gr.Textbox(
                        label="Original Transcript",
                        lines=4,
                        interactive=False,
                        placeholder="Original speech will appear here...",
                    )
                    trans_text = gr.Textbox(
                        label="Translated Text",
                        lines=4,
                        interactive=False,
                        placeholder="Translation will appear here...",
                    )
        def validate_api_key(api_key):
            """Validate and update API key status."""
            if not api_key or not api_key.strip():
                return gr.update(value="ℹ️ Using open source models (EdgeTTS)", visible=True)

            key = api_key.strip()
            if not key.startswith("sk_") or len(key) < 40:
                return gr.update(value="⚠️ Invalid API key format", visible=True)

            try:
                is_valid, message = validate_elevenlabs_api_key(key)
                if is_valid:
                    return gr.update(value="✅ API key validated (used only for this job)", visible=True)
                else:
                    return gr.update(value=f"⚠️ {message}", visible=True)
            except Exception:
                return gr.update(value="ℹ️ Using open source models (EdgeTTS)", visible=True)

        api_key_input.change(
            fn=validate_api_key,
            inputs=[api_key_input],
            outputs=[api_key_status],
        )
        localize_btn.click(
            fn=localize_video,
            inputs=[video_input, lang_dropdown, api_key_input],
            outputs=[video_output, orig_text, trans_text],
            concurrency_limit=1,
        )

    # Use a small queue to avoid overlapping heavy jobs on shared Spaces
    app.queue(max_size=4)
    return app

if __name__ == "__main__":
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=False,
    )