|
import streamlit as st |
|
import time |
|
import torch |
|
import numpy as np |
|
from PIL import Image |
|
import tempfile |
|
import os |
|
import json |
|
import subprocess |
|
from huggingface_hub import hf_hub_download, snapshot_download |
|
import io |
|
import base64 |
|
|
|
|
|
# Configure the Streamlit page; must be the first st.* call in the script.
# NOTE(review): page_icon text appears mojibake-garbled (likely an emoji
# originally) — confirm the file's encoding.
st.set_page_config(
    page_title="MeiGen-MultiTalk Demo",
    page_icon="π¬",
    layout="centered"
)
|
|
|
# Cached across Streamlit reruns: the download/setup work runs once per process.
@st.cache_resource
def load_models():
    """Load the MeiGen-MultiTalk models.

    Downloads the wav2vec2 audio encoder and the MultiTalk weights into a
    local ``models/`` directory on first run (skipped when the target
    directories already exist) and returns
    ``(audio_model_path, multitalk_path)``. On any failure the function
    does not raise; it returns placeholder demo paths instead.
    """
    try:
        st.info("π Loading MeiGen-MultiTalk models... This may take several minutes on first run.")

        models_dir = "models"
        os.makedirs(models_dir, exist_ok=True)

        # Audio encoder: fetched only when not already present on disk.
        audio_model_path = os.path.join(models_dir, "chinese-wav2vec2-base")
        if not os.path.exists(audio_model_path):
            st.info("π₯ Downloading audio model...")
            snapshot_download(
                repo_id="TencentGameMate/chinese-wav2vec2-base",
                local_dir=audio_model_path,
                cache_dir=models_dir
            )

        # MultiTalk weights: best-effort — a failed download is reported via
        # st.warning but the (possibly incomplete) local path is still returned.
        multitalk_path = os.path.join(models_dir, "MeiGen-MultiTalk")
        if not os.path.exists(multitalk_path):
            st.info("π₯ Downloading MeiGen-MultiTalk weights...")
            try:
                snapshot_download(
                    repo_id="MeiGen-AI/MeiGen-MultiTalk",
                    local_dir=multitalk_path,
                    cache_dir=models_dir
                )
            except Exception as e:
                st.warning(f"β οΈ Could not download full model: {e}")
                st.info("π‘ Using available model components...")

        st.success("β Models loaded successfully!")
        return audio_model_path, multitalk_path

    except Exception as e:
        # Broad catch is deliberate: the UI degrades to demo mode rather than
        # crashing when downloads fail (e.g. running offline).
        st.error(f"β Error loading models: {str(e)}")
        st.info("π‘ Falling back to demo mode")
        return "demo_audio_model", "demo_video_model"
|
|
|
def create_input_json(image_path, audio_path, prompt, output_path):
    """Serialize the generation request for MeiGen-MultiTalk.

    Combines the demo's fixed generation parameters with the caller-supplied
    image/audio/prompt/output values and writes them as pretty-printed JSON
    to ``temp_input.json`` in the working directory.

    Returns:
        The path of the JSON file that was written.
    """
    # Fixed defaults used by the demo pipeline.
    settings = {
        "resolution": [480, 720],
        "num_frames": 81,
        "fps": 25,
        "motion_strength": 1.0,
        "guidance_scale": 7.5,
        "audio_cfg": 3.0,
        "seed": 42,
        "num_inference_steps": 25,
    }
    # Per-request fields supplied by the caller (appended after the defaults
    # so the key order in the file matches the historical layout).
    settings.update(
        prompt=prompt,
        image=image_path,
        audio=audio_path,
        output=output_path,
    )

    json_path = "temp_input.json"
    with open(json_path, "w") as handle:
        json.dump(settings, handle, indent=2)
    return json_path
|
|
|
def run_generation(image_path, audio_path, prompt, output_path):
    """Run MeiGen-MultiTalk generation via the external script.

    Writes the generation config JSON, invokes ``real_generation.py`` in a
    subprocess, and returns a status dict of the form
    ``{"status": "success"|"error", "message": str, ...}``.
    Never raises; all failures are folded into an error dict.
    Temp files created for the run are always cleaned up.
    """
    # Single source of truth so the timeout and its user-facing message
    # cannot drift apart again.
    timeout_seconds = 300
    try:
        # Serialize all inputs/settings to the JSON file the script expects.
        json_path = create_input_json(image_path, audio_path, prompt, output_path)

        result = subprocess.run(
            ["python3", "real_generation.py", json_path],
            capture_output=True,
            text=True,
            timeout=timeout_seconds
        )

        if result.returncode == 0:
            return {
                "status": "success",
                "message": "Video generation completed successfully!",
                "output": result.stdout,
                "settings": {
                    "image": image_path,
                    "audio": audio_path,
                    "prompt": prompt
                }
            }
        return {
            "status": "error",
            "message": f"Generation failed: {result.stderr}",
            "output": result.stdout
        }

    except subprocess.TimeoutExpired:
        # BUG FIX: the old message claimed "2 minutes" while the subprocess
        # timeout is actually 300 s; derive the text from the real value.
        return {
            "status": "error",
            "message": f"Generation timed out after {timeout_seconds // 60} minutes"
        }
    except Exception as e:
        return {
            "status": "error",
            "message": f"Generation error: {str(e)}"
        }
    finally:
        # Best-effort cleanup; "temp_generation.py" is removed here even though
        # this function does not create it — presumably a legacy artifact.
        for temp_file in ["temp_input.json", "temp_generation.py"]:
            if os.path.exists(temp_file):
                os.remove(temp_file)
|
|
|
def process_inputs(image, audio, prompt, progress_bar):
    """Process the inputs and generate video.

    Parameters:
        image: PIL image of the reference speaker (saved to a temp JPEG).
        audio: uploaded file-like object providing ``.read()``.
        prompt: non-empty text prompt.
        progress_bar: Streamlit progress widget updated during the run.

    Returns a user-facing status string (success report or error message).
    """
    # Validate inputs up front; each check returns an error string for the UI.
    if image is None:
        return "β Please upload an image"

    if audio is None:
        return "β Please upload an audio file"

    if not prompt:
        return "β Please enter a prompt"

    try:
        # Persist the PIL image to a temp JPEG so the generation subprocess
        # can read it from disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as img_temp:
            image.save(img_temp.name, "JPEG")
            image_path = img_temp.name

        # Persist the uploaded audio bytes to a temp WAV file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as audio_temp:
            audio_temp.write(audio.read())
            audio_path = audio_temp.name

        # NOTE(review): tempfile.mktemp is deprecated and race-prone; consider
        # tempfile.mkstemp instead.
        output_path = tempfile.mktemp(suffix=".mp4")

        progress_bar.progress(20, "π¬ Initializing generation...")

        # Resolve (and possibly download) model weights; cached by Streamlit.
        audio_model_path, multitalk_path = load_models()

        if not audio_model_path or not multitalk_path:
            return "β Failed to load models"

        progress_bar.progress(40, "π Processing inputs...")

        # Blocking call: runs the generation subprocess to completion.
        result = run_generation(image_path, audio_path, prompt, output_path)

        # NOTE(review): this 80% update happens after generation has already
        # finished, so the bar jumps rather than tracking real progress.
        progress_bar.progress(80, "π₯ Generating video...")

        time.sleep(2)  # short pause so the final progress state is visible
        progress_bar.progress(100, "β Complete!")

        # Remove the temporary input files; the output path is left in place.
        for temp_file in [image_path, audio_path]:
            if os.path.exists(temp_file):
                os.remove(temp_file)

        if result["status"] == "success":
            return f"""β Video generation completed successfully!

**Input processed:**
- Image: β Uploaded ({image.size} pixels)
- Audio: β Uploaded and processed
- Prompt: {prompt}

**Generation Settings:**
- Resolution: 480x720
- Frames: 81 (3.24 seconds at 25 FPS)
- Audio CFG: 3.0
- Guidance Scale: 7.5
- Inference Steps: 25

**Status:** {result['message']}

**Note:** This demo shows the complete integration pipeline with MeiGen-MultiTalk.
The actual video generation requires significant computational resources and model weights.

π¬ Ready for full deployment with proper hardware setup!"""
        else:
            return f"β Generation failed: {result['message']}"

    except Exception as e:
        # Broad catch keeps the UI responsive; the error text is surfaced
        # to the user instead of crashing the app.
        return f"β Error during processing: {str(e)}"
|
|
|
|
|
# ---------------------------------------------------------------------------
# Page body: title plus a collapsible, display-only model overview.
# ---------------------------------------------------------------------------
st.title("π¬ MeiGen-MultiTalk Demo")
st.markdown("**Real Audio-Driven Multi-Person Conversational Video Generation**")

with st.expander("βΉοΈ About MeiGen-MultiTalk"):
    st.markdown("""
    **MeiGen-MultiTalk** is a state-of-the-art audio-driven video generation model that can:

    - π¬ Generate realistic conversations from audio and images
    - π₯ Support both single and multi-person scenarios
    - π― Achieve high-quality lip synchronization
    - πΊ Output videos in 480p and 720p resolutions
    - β±οΈ Generate videos up to 15 seconds long

    **Model Details:**
    - Base Model: Wan2.1-I2V-14B-480P
    - Audio Encoder: Chinese Wav2Vec2
    - Framework: Diffusion Transformers
    - License: Apache 2.0
    """)
|
|
|
|
|
# Two-column layout: inputs on the left (col1), results on the right (col2).
col1, col2 = st.columns(2)

with col1:
    st.header("π Input Files")

    # Reference image of the speaker.
    uploaded_image = st.file_uploader(
        "Choose a reference image",
        type=['png', 'jpg', 'jpeg'],
        help="Upload a clear, front-facing photo of the person who will be speaking"
    )

    if uploaded_image is not None:
        image = Image.open(uploaded_image)
        st.image(image, caption="Reference Image", use_container_width=True)

    # Driving audio clip.
    uploaded_audio = st.file_uploader(
        "Choose an audio file",
        type=['mp3', 'wav', 'ogg', 'm4a'],
        help="Upload clear audio without background noise (max 15 seconds for best results)"
    )

    if uploaded_audio is not None:
        st.audio(uploaded_audio, format='audio/wav')

    # Free-text generation prompt with a sensible default.
    prompt = st.text_area(
        "Enter a prompt",
        value="A person talking naturally with expressive facial movements",
        placeholder="Describe the desired talking style and expression...",
        help="Be specific about the desired talking style, emotions, and movements"
    )

    # NOTE(review): these widget values (audio_cfg, guidance_scale, num_steps,
    # seed) are rendered but never passed to process_inputs/run_generation —
    # create_input_json hard-codes the corresponding settings.
    with st.expander("βοΈ Advanced Settings"):
        st.markdown("**Generation Parameters:**")

        col1a, col1b = st.columns(2)
        with col1a:
            audio_cfg = st.slider("Audio CFG Scale", 1.0, 5.0, 3.0, 0.1,
                                  help="Controls audio influence on lip sync (3-5 optimal)")
            guidance_scale = st.slider("Guidance Scale", 1.0, 15.0, 7.5, 0.5,
                                       help="Controls adherence to prompt")

        with col1b:
            num_steps = st.slider("Inference Steps", 10, 50, 25, 1,
                                  help="More steps = better quality, slower generation")
            seed = st.number_input("Random Seed", 0, 999999, 42,
                                   help="Set for reproducible results")
|
with col2:
    st.header("π₯ Results")

    if st.button("π¬ Generate Video", type="primary", use_container_width=True):
        if uploaded_image is not None and uploaded_audio is not None and prompt:
            progress_bar = st.progress(0, "Initializing...")

            # Run the full pipeline; returns a user-facing status string.
            result = process_inputs(
                Image.open(uploaded_image),
                uploaded_audio,
                prompt,
                progress_bar
            )

            progress_bar.empty()

            # Success strings from process_inputs carry this leading marker
            # character; error strings do not.
            if "β" in result:
                st.success("Generation Complete!")
                st.text_area("Generation Log", result, height=400)

                st.markdown("### π₯ Download Options")
                st.info("π‘ In full deployment, generated video would be available for download here")

            else:
                st.error("Generation Failed")
                st.text_area("Error Log", result, height=200)
        else:
            st.error("β Please upload both image and audio files, and enter a prompt")
|
|
|
|
|
with st.sidebar:
    st.header("π§ System Status")

    # SPACE_ID is present in the environment when running on HF Spaces.
    if "SPACE_ID" in os.environ:
        st.success("β Running on Hugging Face Spaces")
    else:
        st.info("βΉοΈ Running locally")

    # Static hardware-requirements blurb.
    st.markdown("### π» Requirements")
    st.markdown("""
    **For full functionality:**
    - GPU: 8GB+ VRAM (RTX 4090 recommended)
    - RAM: 16GB+ system memory
    - Storage: 20GB+ for model weights

    **Current demo:**
    - Shows complete integration pipeline
    - Ready for deployment with proper resources
    """)

    # External links for the model, code, and paper.
    st.markdown("### π Resources")
    st.markdown("""
    - [π€ Model Hub](https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk)
    - [π GitHub Repo](https://github.com/MeiGen-AI/MultiTalk)
    - [π Paper](https://arxiv.org/abs/2505.22647)
    - [π Project Page](https://meigen-ai.github.io/multi-talk/)
    """)
|
|
|
|
|
# Footer: usage tips laid out in three columns, followed by a credit line.
st.markdown("---")
st.markdown("### π Tips for Best Results")

# NOTE(review): this rebinds col1/col2 from the main two-column layout above;
# harmless because the earlier columns are no longer used past this point.
col1, col2, col3 = st.columns(3)

with col1:
    st.markdown("""
    **πΌοΈ Image Quality:**
    - Use clear, front-facing photos
    - Good lighting conditions
    - High resolution (512x512+)
    - Single person clearly visible
    """)

with col2:
    st.markdown("""
    **π΅ Audio Quality:**
    - Clear speech without background noise
    - Supported: MP3, WAV, OGG, M4A
    - Duration: 1-15 seconds optimal
    - Good volume levels
    """)

with col3:
    st.markdown("""
    **βοΈ Prompt Tips:**
    - Be specific about expressions
    - Mention talking style
    - Include emotional context
    - Keep it concise but descriptive
    """)

st.markdown("---")
st.markdown("*Powered by MeiGen-MultiTalk - State-of-the-art Audio-Driven Video Generation*")