MuseTalk

Running

App Files Community

Ultronprime committed on Apr 6

Commit

b9a578a

verified ·

1 Parent(s): 356d0ea

Update inference.py

Browse files

Files changed (1) hide show

inference.py +111 -194

inference.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """MuseTalk Inference Module
-This module provides the core inference functionality for MuseTalk,
-enabling audio-driven lip-sync video generation.
 """
 import os
@@ -9,20 +9,17 @@ import cv2
 import torch
 import numpy as np
 import tempfile
 from pathlib import Path
 from typing import Optional, Tuple, Union
-import subprocess
 class MuseTalkInference:
     """MuseTalk inference engine for audio-driven video generation."""
     def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
-        """Initialize MuseTalk inference engine.
-        Args:
-            device: torch device to use ('cuda' or 'cpu')
-        """
         self.device = device
         self.model = None
         self.whisper_model = None
@@ -31,45 +28,30 @@ class MuseTalkInference:
         self.initialized = False
     def load_models(self, progress_callback=None):
-        """Load MuseTalk models from HuggingFace Hub.
-        Args:
-            progress_callback: Optional callback to report loading progress
-        """
         try:
             if progress_callback:
                 progress_callback(0, "Loading MuseTalk models...")
-            # For now, return success - models will be loaded lazily during inference
             self.initialized = True
             if progress_callback:
-                progress_callback(100, "Models loaded successfully")
         except Exception as e:
             print(f"Error loading models: {e}")
             raise
     def extract_audio_features(self, audio_path: str, progress_callback=None) -> np.ndarray:
-        """Extract audio features using Whisper.
-        Args:
-            audio_path: Path to audio file
-            progress_callback: Optional progress callback
-        Returns:
-            Audio features array
-        """
         try:
             if progress_callback:
                 progress_callback(10, "Extracting audio features...")
-            # Load audio file
             try:
-                import librosa
                 audio, sr = librosa.load(audio_path, sr=16000)
             except:
-                # Fallback using scipy
                 try:
                     import scipy.io.wavfile as wavfile
                     sr, audio = wavfile.read(audio_path)
@@ -77,24 +59,20 @@ class MuseTalkInference:
                         ratio = 16000 / sr
                         audio = (audio * ratio).astype(np.int16)
                 except:
-                    # Additional fallback
                     import soundfile as sf
                     audio, sr = sf.read(audio_path)
-            # Normalize audio
             audio = audio.astype(np.float32)
             audio = audio / (np.max(np.abs(audio)) + 1e-8)
-            # Create feature representation (mel-spectrogram)
             n_mels = 80
             n_fft = 400
             hop_length = 160
-            # Simple mel-spectrogram computation
             mel_features = self._compute_mel_spectrogram(audio, sr, n_mels, n_fft, hop_length)
             if progress_callback:
-                progress_callback(30, "Audio features extracted")
             return mel_features
@@ -102,42 +80,36 @@ class MuseTalkInference:
             print(f"Error extracting audio features: {e}")
             raise
-    def extract_video_frames(self, video_path: str, fps: int = 25, progress_callback=None) -> Tuple[list, int, int]:
-        """Extract frames from video file.
-        Args:
-            video_path: Path to video file
-            fps: Target fps for extraction
-            progress_callback: Optional progress callback
-        Returns:
-            Tuple of (frames list, width, height)
-        """
         try:
             if progress_callback:
-                progress_callback(10, "Extracting video frames...")
-            cap = cv2.VideoCapture(video_path)
             frames = []
-            frame_count = 0
-            while True:
-                ret, frame = cap.read()
-                if not ret:
-                    break
                 frames.append(frame)
-                frame_count += 1
-            cap.release()
             if not frames:
-                raise ValueError("No frames extracted from video")
             height, width = frames[0].shape[:2]
-            if progress_callback:
-                progress_callback(30, f"Extracted {len(frames)} frames")
             return frames, width, height
         except Exception as e:
@@ -145,22 +117,12 @@ class MuseTalkInference:
             raise
     def detect_faces(self, frames: list, progress_callback=None) -> list:
-        """Detect faces in video frames.
-        Args:
-            frames: List of video frames
-            progress_callback: Optional progress callback
-        Returns:
-            List of face bounding boxes for each frame
-        """
         try:
             if progress_callback:
-                progress_callback(40, "Detecting faces in frames...")
             face_detections = []
-            # Use OpenCV's Haar Cascade for face detection
             cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
             face_cascade = cv2.CascadeClassifier(cascade_path)
@@ -169,145 +131,112 @@ class MuseTalkInference:
                 faces = face_cascade.detectMultiScale(gray, 1.1, 4)
                 if len(faces) > 0:
-                    # Take the largest face
                     face = max(faces, key=lambda f: f[2] * f[3])
                     face_detections.append(face)
                 else:
-                    # Use previous face detection or frame dimensions
                     if face_detections:
                         face_detections.append(face_detections[-1])
                     else:
                         h, w = frame.shape[:2]
                         face_detections.append(np.array([w//4, h//4, w//2, h//2]))
-                if (i + 1) % max(1, len(frames) // 10) == 0 and progress_callback:
-                    progress_callback(40 + int((i + 1) / len(frames) * 20), f"Detected faces: {i + 1}/{len(frames)}")
             return face_detections
         except Exception as e:
             print(f"Error detecting faces: {e}")
             raise
-    def generate_lipsync(self, frames: list, audio_features: np.ndarray, face_detections: list,
-                        progress_callback=None) -> list:
-        """Generate lip-sync frames.
-        Args:
-            frames: List of original video frames
-            audio_features: Audio feature array
-            face_detections: List of face bounding boxes
-            progress_callback: Optional progress callback
-        Returns:
-            List of lip-synced frames
         """
         try:
-            if progress_callback:
-                progress_callback(60, "Generating lip-sync...")
-            lipsync_frames = []
-            # For now, return frames with marked regions (placeholder for actual inference)
-            for i, frame in enumerate(frames):
-                output_frame = frame.copy()
-                if i < len(face_detections):
-                    face = face_detections[i]
-                    x, y, w, h = int(face[0]), int(face[1]), int(face[2]), int(face[3])
-                    # Draw rectangle around detected face region
-                    cv2.rectangle(output_frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
-                lipsync_frames.append(output_frame)
-                if (i + 1) % max(1, len(frames) // 10) == 0 and progress_callback:
-                    progress_callback(60 + int((i + 1) / len(frames) * 20), f"Lip-sync frames: {i + 1}/{len(frames)}")
-            return lipsync_frames
-        except Exception as e:
-            print(f"Error generating lip-sync: {e}")
-            raise
-    def save_output_video(self, frames: list, output_path: str, fps: int = 25, progress_callback=None) -> str:
-        """Save generated frames as video file.
-        Args:
-            frames: List of output frames
-            output_path: Path to save output video
-            fps: Frames per second for output video
-            progress_callback: Optional progress callback
-        Returns:
-            Path to saved video file
-        """
-        try:
-            if progress_callback:
-                progress_callback(80, "Encoding video...")
-            if not frames:
-                raise ValueError("No frames to save")
-            height, width = frames[0].shape[:2]
-            # Use OpenCV VideoWriter
             fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-            for i, frame in enumerate(frames):
                 out.write(frame)
-                if (i + 1) % max(1, len(frames) // 10) == 0 and progress_callback:
-                    progress_callback(80 + int((i + 1) / len(frames) * 15), f"Encoding: {i + 1}/{len(frames)}")
             out.release()
             if progress_callback:
-                progress_callback(95, "Video encoding complete")
-            return output_path
-        except Exception as e:
-            print(f"Error saving video: {e}")
-            raise
-    def generate(self, audio_path: str, video_path: str, output_path: str,
-                 fps: int = 25, progress_callback=None) -> str:
-        """Generate lip-synced video from audio and video.
-        Args:
-            audio_path: Path to input audio file
-            video_path: Path to input video file
-            output_path: Path to save output video
-            fps: Target fps for output
-            progress_callback: Optional progress callback
-        Returns:
-            Path to generated video
-        """
-        try:
-            # Initialize models if not already done
-            if not self.initialized:
-                self.load_models(progress_callback)
-            # Extract audio features
-            audio_features = self.extract_audio_features(audio_path, progress_callback)
-            # Extract video frames
-            frames, width, height = self.extract_video_frames(video_path, fps, progress_callback)
-            # Detect faces
-            face_detections = self.detect_faces(frames, progress_callback)
-            # Generate lip-sync
-            output_frames = self.generate_lipsync(frames, audio_features, face_detections, progress_callback)
-            # Save output video
-            result_path = self.save_output_video(output_frames, output_path, fps, progress_callback)
             if progress_callback:
-                progress_callback(100, "Lip-sync generation complete!")
-            return result_path
         except Exception as e:
             print(f"Error during generation: {e}")
@@ -315,18 +244,7 @@ class MuseTalkInference:
     def _compute_mel_spectrogram(self, audio: np.ndarray, sr: int, n_mels: int,
                                 n_fft: int, hop_length: int) -> np.ndarray:
-        """Compute mel-spectrogram from audio.
-        Args:
-            audio: Audio signal
-            sr: Sample rate
-            n_mels: Number of mel bins
-            n_fft: FFT window size
-            hop_length: Hop length
-        Returns:
-            Mel-spectrogram array
-        """
         try:
             import librosa
             mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft,
@@ -334,6 +252,5 @@ class MuseTalkInference:
             mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
             return mel_spec
         except:
-            # Fallback: return a dummy feature array
             n_frames = len(audio) // hop_length
             return np.random.randn(n_mels, n_frames)

 """MuseTalk Inference Module
+Refactored for Long-Form Generation (5-10 mins)
+using Memory-Efficient Streaming, Looping, and Audio Muxing.
 """
 import os
 import torch
 import numpy as np
 import tempfile
+import librosa
+import mimetypes
+import subprocess
 from pathlib import Path
 from typing import Optional, Tuple, Union
 class MuseTalkInference:
     """MuseTalk inference engine for audio-driven video generation."""
     def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
         self.device = device
         self.model = None
         self.whisper_model = None
         self.initialized = False
     def load_models(self, progress_callback=None):
+        """Load MuseTalk models from HuggingFace Hub."""
         try:
             if progress_callback:
                 progress_callback(0, "Loading MuseTalk models...")
+            # Placeholder: Initialize your actual PyTorch models here
             self.initialized = True
             if progress_callback:
+                progress_callback(5, "Models loaded successfully")
         except Exception as e:
             print(f"Error loading models: {e}")
             raise
     def extract_audio_features(self, audio_path: str, progress_callback=None) -> np.ndarray:
+        """Extract audio features using Whisper/Mel-Spectrogram."""
         try:
             if progress_callback:
                 progress_callback(10, "Extracting audio features...")
             try:
                 audio, sr = librosa.load(audio_path, sr=16000)
             except:
                 try:
                     import scipy.io.wavfile as wavfile
                     sr, audio = wavfile.read(audio_path)
                         ratio = 16000 / sr
                         audio = (audio * ratio).astype(np.int16)
                 except:
                     import soundfile as sf
                     audio, sr = sf.read(audio_path)
             audio = audio.astype(np.float32)
             audio = audio / (np.max(np.abs(audio)) + 1e-8)
             n_mels = 80
             n_fft = 400
             hop_length = 160
             mel_features = self._compute_mel_spectrogram(audio, sr, n_mels, n_fft, hop_length)
             if progress_callback:
+                progress_callback(15, "Audio features extracted")
             return mel_features
             print(f"Error extracting audio features: {e}")
             raise
+    def extract_source_frames(self, file_path: str, fps: int = 25, progress_callback=None) -> Tuple[list, int, int]:
+        """Extracts frames from a short video or loads a single image to memory."""
         try:
             if progress_callback:
+                progress_callback(20, "Reading source image/video...")
+            mime_type, _ = mimetypes.guess_type(file_path)
             frames = []
+            # Handle Single Image Input
+            if mime_type and mime_type.startswith('image'):
+                frame = cv2.imread(file_path)
+                if frame is None:
+                    raise ValueError("Failed to read image")
                 frames.append(frame)
+            # Handle Short Video Input
+            else:
+                cap = cv2.VideoCapture(file_path)
+                while True:
+                    ret, frame = cap.read()
+                    if not ret:
+                        break
+                    frames.append(frame)
+                cap.release()
             if not frames:
+                raise ValueError("No frames extracted from source file")
             height, width = frames[0].shape[:2]
             return frames, width, height
         except Exception as e:
             raise
     def detect_faces(self, frames: list, progress_callback=None) -> list:
+        """Detect faces ONLY on the short source clip to save compute."""
         try:
             if progress_callback:
+                progress_callback(25, "Detecting face in source media...")
             face_detections = []
             cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
             face_cascade = cv2.CascadeClassifier(cascade_path)
                 faces = face_cascade.detectMultiScale(gray, 1.1, 4)
                 if len(faces) > 0:
+                    # Take the LARGEST face by area (width * height)
                     face = max(faces, key=lambda f: f[2] * f[3])
                     face_detections.append(face)
                 else:
                     if face_detections:
                         face_detections.append(face_detections[-1])
                     else:
                         h, w = frame.shape[:2]
                         face_detections.append(np.array([w//4, h//4, w//2, h//2]))
             return face_detections
         except Exception as e:
             print(f"Error detecting faces: {e}")
             raise
+    def generate(self, audio_path: str, video_path: str, output_path: str,
+                 fps: int = 25, progress_callback=None) -> str:
+        """
+        Memory-efficient generator for long videos.
+        Loops short inputs to match 5-10 minute audio.
         """
         try:
+            if not self.initialized:
+                self.load_models(progress_callback)
+            # 1. Extract audio features
+            audio_features = self.extract_audio_features(audio_path, progress_callback)
+            # 2. Determine Total Output Frames based on Audio Length
+            audio_data, sr = librosa.load(audio_path, sr=16000)
+            audio_duration = len(audio_data) / sr
+            total_target_frames = int(audio_duration * fps)
+            if total_target_frames == 0:
+                raise ValueError("Audio file is too short or invalid.")
+            # 3. Extract Source Clip/Image (Only loads short clip into memory)
+            source_frames, width, height = self.extract_source_frames(video_path, fps, progress_callback)
+            # 4. Detect faces on the short source clip (Pre-cached)
+            source_faces = self.detect_faces(source_frames, progress_callback)
+            # 5. Stream Process (Write directly to file to avoid OOM crash)
+            temp_silent_video = output_path.replace('.mp4', '_silent.mp4')
             fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+            out = cv2.VideoWriter(temp_silent_video, fourcc, fps, (width, height))
+            if progress_callback:
+                progress_callback(30, f"Generating {total_target_frames} frames (Streaming)...")
+            for i in range(total_target_frames):
+                # LOOPING LOGIC: Loop the short video or image continuously
+                src_idx = i % len(source_frames)
+                frame = source_frames[src_idx].copy()
+                face = source_faces[src_idx]
+                # --- START AI LIP-SYNC INFERENCE ---
+                # NOTE: Put your actual AI model generation code here.
+                # Right now, this just draws a box around the face.
+                # Example: frame = self.model.infer(frame, face, audio_features[:, i])
+                x, y, w, h = int(face[0]), int(face[1]), int(face[2]), int(face[3])
+                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
+                # --- END AI LIP-SYNC INFERENCE ---
+                # Write directly to disk (Saves 30GB+ of RAM for 10 min videos)
                 out.write(frame)
+                # Report progress periodically
+                if (i + 1) % max(1, total_target_frames // 20) == 0 and progress_callback:
+                    progress_pct = 30 + int((i / total_target_frames) * 60)
+                    progress_callback(progress_pct, f"Generated frames: {i + 1}/{total_target_frames}")
             out.release()
+            # 6. MUX AUDIO (Combine the generated silent video with original audio)
             if progress_callback:
+                progress_callback(95, "Merging final audio and video...")
+            try:
+                cmd = [
+                    "ffmpeg", "-y",
+                    "-i", temp_silent_video,   # The generated silent video
+                    "-i", audio_path,          # The original audio
+                    "-c:v", "libx264",         # Re-encode video for broad web compatibility
+                    "-c:a", "aac",             # Re-encode audio to AAC
+                    "-map", "0:v:0",
+                    "-map", "1:a:0",
+                    "-shortest",               # Cut at the shortest stream
+                    output_path
+                ]
+                subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                # Cleanup temp file
+                if os.path.exists(temp_silent_video):
+                    os.remove(temp_silent_video)
+            except subprocess.CalledProcessError as e:
+                print(f"FFMPEG Error: {e.stderr}")
+                # Fallback to silent video if FFMPEG fails
+                os.rename(temp_silent_video, output_path)
             if progress_callback:
+                progress_callback(100, "Generation Complete!")
+            return output_path
         except Exception as e:
             print(f"Error during generation: {e}")
     def _compute_mel_spectrogram(self, audio: np.ndarray, sr: int, n_mels: int,
                                 n_fft: int, hop_length: int) -> np.ndarray:
+        """Compute mel-spectrogram from audio."""
         try:
             import librosa
             mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft,
             mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
             return mel_spec
         except:
             n_frames = len(audio) // hop_length
             return np.random.randn(n_mels, n_frames)