Spaces:

devangoyal
/

EVA-multimodal-video-sentiment-analysis-model

Running

App Files Files Community

devangoyal committed on Jun 24

Commit

d898dac

1 Parent(s): 49f7b31

Deploy

Browse files

Files changed (5) hide show

app.py +32 -0
inference.py +314 -0
models.py +137 -0
requirements.txt +13 -0
saved_models/checkpoint.pth +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import gradio as gr
+from inference import model_fn, predict_fn
+import os
+# Build the model bundle once at import time; predict() reuses it for every request.
+model_dict = model_fn(".")
+def predict(video):
+    temp_path = "temp.mp4"
+    try:
+        # Gradio gives a file path for video input
+        if isinstance(video, str) and os.path.exists(video):
+            os.rename(video, temp_path)
+        else:
+            with open(temp_path, "wb") as f:
+                f.write(video.read())
+        input_data = {"video_path": temp_path}
+        result = predict_fn(input_data, model_dict)
+        return result
+    finally:
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+demo = gr.Interface(
+    fn=predict,
+    inputs=gr.Video(type="filepath"),
+    outputs="json",
+    title="Video Sentiment Analysis",
+    description="Upload an .mp4 video to get sentiment and emotion predictions for each utterance."
+)
+if __name__ == "__main__":
+    demo.launch()

inference.py ADDED Viewed

	@@ -0,0 +1,314 @@

+import torch
+from models import MultimodalSentimentModel
+import os
+import cv2
+import numpy as np
+import subprocess
+import torchaudio
+from transformers import AutoTokenizer
+import whisper
+import sys
+# Class index -> emotion label; predict_fn uses it to name the emotion head's top-k indices.
+EMOTION_MAP = {0: "anger", 1: "disgust", 2: "fear",
+               3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
+# Class index -> sentiment label; predict_fn uses it to name the sentiment head's top-k indices.
+SENTIMENT_MAP = {0: "negative", 1: "neutral", 2: "positive"}
+def install_ffmpeg():
+    print("Starting Ffmpeg installation...")
+    subprocess.check_call([sys.executable, "-m", "pip",
+                          "install", "--upgrade", "pip"])
+    subprocess.check_call([sys.executable, "-m", "pip",
+                          "install", "--upgrade", "setuptools"])
+    try:
+        subprocess.check_call([sys.executable, "-m", "pip",
+                               "install", "ffmpeg-python"])
+        print("Installed ffmpeg-python successfully")
+    except subprocess.CalledProcessError as e:
+        print("Failed to install ffmpeg-python via pip")
+    try:
+        subprocess.check_call([
+            "wget",
+            "https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz",
+            "-O", "/tmp/ffmpeg.tar.xz"
+        ])
+        subprocess.check_call([
+            "tar", "-xf", "/tmp/ffmpeg.tar.xz", "-C", "/tmp/"
+        ])
+        result = subprocess.run(
+            ["find", "/tmp", "-name", "ffmpeg", "-type", "f"],
+            capture_output=True,
+            text=True
+        )
+        ffmpeg_path = result.stdout.strip()
+        subprocess.check_call(["cp", ffmpeg_path, "/usr/local/bin/ffmpeg"])
+        subprocess.check_call(["chmod", "+x", "/usr/local/bin/ffmpeg"])
+        print("Installed static FFmpeg binary successfully")
+    except Exception as e:
+        print(f"Failed to install static FFmpeg: {e}")
+    try:
+        result = subprocess.run(["ffmpeg", "-version"],
+                                capture_output=True, text=True, check=True)
+        print("FFmpeg version:")
+        print(result.stdout)
+        return True
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        print("FFmpeg installation verification failed")
+        return False
+class VideoProcessor:
+    def process_video(self, video_path):
+        cap = cv2.VideoCapture(video_path)
+        frames = []
+        try:
+            if not cap.isOpened():
+                raise ValueError(f"Video not found: {video_path}")
+            # Try and read first frame to validate video
+            ret, frame = cap.read()
+            if not ret or frame is None:
+                raise ValueError(f"Video not found: {video_path}")
+            # Reset index to not skip first frame
+            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
+            while len(frames) < 30 and cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                frame = cv2.resize(frame, (224, 224))
+                frame = frame / 255.0
+                frames.append(frame)
+        except Exception as e:
+            raise ValueError(f"Video error: {str(e)}")
+        finally:
+            cap.release()
+        if (len(frames) == 0):
+            raise ValueError("No frames could be extracted")
+        # Pad or truncate frames
+        if len(frames) < 30:
+            frames += [np.zeros_like(frames[0])] * (30 - len(frames))
+        else:
+            frames = frames[:30]
+        # Before permute: [frames, height, width, channels]
+        # After permute: [frames, channels, height, width]
+        return torch.FloatTensor(np.array(frames)).permute(0, 3, 1, 2)
+class AudioProcessor:
+    def extract_features(self, video_path, max_length=300):
+        audio_path = video_path.replace('.mp4', '.wav')
+        try:
+            subprocess.run([
+                'ffmpeg',
+                '-i', video_path,
+                '-vn',
+                '-acodec', 'pcm_s16le',
+                '-ar', '16000',
+                '-ac', '1',
+                audio_path
+            ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            waveform, sample_rate = torchaudio.load(audio_path)
+            if sample_rate != 16000:
+                resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+                waveform = resampler(waveform)
+            mel_spectrogram = torchaudio.transforms.MelSpectrogram(
+                sample_rate=16000,
+                n_mels=64,
+                n_fft=1024,
+                hop_length=512
+            )
+            mel_spec = mel_spectrogram(waveform)
+            # Normalize
+            mel_spec = (mel_spec - mel_spec.mean()) / mel_spec.std()
+            if mel_spec.size(2) < 300:
+                padding = 300 - mel_spec.size(2)
+                mel_spec = torch.nn.functional.pad(mel_spec, (0, padding))
+            else:
+                mel_spec = mel_spec[:, :, :300]
+            return mel_spec
+        except subprocess.CalledProcessError as e:
+            raise ValueError(f"Audio extraction error: {str(e)}")
+        except Exception as e:
+            raise ValueError(f"Audio error: {str(e)}")
+        finally:
+            if os.path.exists(audio_path):
+                os.remove(audio_path)
+class VideoUtteranceProcessor:
+    def __init__(self):
+        self.video_processor = VideoProcessor()
+        self.audio_processor = AudioProcessor()
+    def extract_segment(self, video_path, start_time, end_time, temp_dir="/tmp"):
+        os.makedirs(temp_dir, exist_ok=True)
+        segment_path = os.path.join(
+            temp_dir, f"segment_{start_time}_{end_time}.mp4")
+        subprocess.run([
+            "ffmpeg", "-i", video_path,
+            "-ss", str(start_time),
+            "-to", str(end_time),
+            "-c:v", "libx264",
+            "-c:a", "aac",
+            "-y",
+            segment_path
+        ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        if not os.path.exists(segment_path) or os.path.getsize(segment_path) == 0:
+            raise ValueError("Segment extraction failed: " + segment_path)
+        return segment_path
+def model_fn(model_dir):
+    # Load the model for inference
+    if not install_ffmpeg():
+        raise RuntimeError(
+            "FFmpeg installation failed - required for inference")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = MultimodalSentimentModel().to(device)
+    model_path = os.path.join(model_dir, 'model.pth')
+    if not os.path.exists(model_path):
+        model_path = os.path.join(model_dir, "saved_models", 'checkpoint.pth')
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(
+                "Model file not found in path " + model_path)
+    print("Loading model from path: " + model_path)
+    model.load_state_dict(torch.load(
+        model_path, map_location=device, weights_only=True))
+    model.eval()
+    return {
+        'model': model,
+        'tokenizer': AutoTokenizer.from_pretrained('bert-base-uncased'),
+        'transcriber': whisper.load_model(
+            "base",
+            device="cpu" if device.type == "cpu" else device,
+        ),
+        'device': device
+    }
+def predict_fn(input_data, model_dict):
+    model = model_dict['model']
+    tokenizer = model_dict['tokenizer']
+    device = model_dict['device']
+    video_path = input_data['video_path']
+    result = model_dict['transcriber'].transcribe(
+        video_path, word_timestamps=True)
+    utterance_processor = VideoUtteranceProcessor()
+    predictions = []
+    for segment in result["segments"]:
+        try:
+            segment_path = utterance_processor.extract_segment(
+                video_path,
+                segment["start"],
+                segment["end"]
+            )
+            video_frames = utterance_processor.video_processor.process_video(
+                segment_path)
+            audio_features = utterance_processor.audio_processor.extract_features(
+                segment_path)
+            text_inputs = tokenizer(
+                segment["text"],
+                padding="max_length",
+                truncation=True,
+                max_length=128,
+                return_tensors="pt"
+            )
+            # Move to device
+            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
+            video_frames = video_frames.unsqueeze(0).to(device)
+            audio_features = audio_features.unsqueeze(0).to(device)
+            # Get predictions
+            with torch.inference_mode():
+                outputs = model(text_inputs, video_frames, audio_features)
+                emotion_probs = torch.softmax(outputs["emotions"], dim=1)[0]
+                sentiment_probs = torch.softmax(
+                    outputs["sentiments"], dim=1)[0]
+                emotion_values, emotion_indices = torch.topk(emotion_probs, 3)
+                sentiment_values, sentiment_indices = torch.topk(
+                    sentiment_probs, 3)
+            predictions.append({
+                "start_time": segment["start"],
+                "end_time": segment["end"],
+                "text": segment["text"],
+                "emotions": [
+                    {"label": EMOTION_MAP[idx.item()], "confidence": conf.item()} for idx, conf in zip(emotion_indices, emotion_values)
+                ],
+                "sentiments": [
+                    {"label": SENTIMENT_MAP[idx.item()], "confidence": conf.item()} for idx, conf in zip(sentiment_indices, sentiment_values)
+                ]
+            })
+        except Exception as e:
+            print("Segment failed inference: " + str(e))
+        finally:
+            # Cleanup
+            if os.path.exists(segment_path):
+                os.remove(segment_path)
+    return {"utterances": predictions}
+def process_local_video(video_path, model_dir="."):
+    model_dict = model_fn(model_dir)
+    input_data = {'video_path': video_path}
+    predictions = predict_fn(input_data, model_dict)
+    for utterance in predictions["utterances"]:
+        print("\nUtterance:")
+        print(f"""Start: {utterance['start_time']}s, End: {
+              utterance['end_time']}s""")
+        print(f"Text: {utterance['text']}")
+        print("\n Top Emotions:")
+        for emotion in utterance['emotions']:
+            print(f"{emotion['label']}: {emotion['confidence']:.2f}")
+        print("\n Top Sentiments:")
+        for sentiment in utterance['sentiments']:
+            print(f"{sentiment['label']}: {sentiment['confidence']:.2f}")
+        print("-"*50)
+# Script entry point: run the pipeline on a sample clip in the working directory.
+if __name__ == "__main__":
+    process_local_video("./dia2_utt3.mp4")

models.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import torch
+import torch.nn as nn
+from transformers import BertModel
+from torchvision import models as vision_models
+class TextEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.bert = BertModel.from_pretrained('bert-base-uncased')
+        for param in self.bert.parameters():
+            param.requires_grad = False
+        self.projection = nn.Linear(768, 128)
+    def forward(self, input_ids, attention_mask):
+        # Extract BERT embeddings
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        # Use [CLS] token representation
+        pooler_output = outputs.pooler_output
+        return self.projection(pooler_output)
+class VideoEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.backbone = vision_models.video.r3d_18(pretrained=True)
+        for param in self.backbone.parameters():
+            param.requires_grad = False
+        num_fts = self.backbone.fc.in_features
+        self.backbone.fc = nn.Sequential(
+            nn.Linear(num_fts, 128),
+            nn.ReLU(),
+            nn.Dropout(0.2)
+        )
+    def forward(self, x):
+        # [batch_size, frames, channels, height, width]->[batch_size, channels, frames, height, width]
+        x = x.transpose(1, 2)
+        return self.backbone(x)
+class AudioEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv_layers = nn.Sequential(
+            # Lower level features
+            nn.Conv1d(64, 64, kernel_size=3),
+            nn.BatchNorm1d(64),
+            nn.ReLU(),
+            nn.MaxPool1d(2),
+            # Higher level features
+            nn.Conv1d(64, 128, kernel_size=3),
+            nn.BatchNorm1d(128),
+            nn.ReLU(),
+            nn.AdaptiveAvgPool1d(1)
+        )
+        for param in self.conv_layers.parameters():
+            param.requires_grad = False
+        self.projection = nn.Sequential(
+            nn.Linear(128, 128),
+            nn.ReLU(),
+            nn.Dropout(0.2)
+        )
+    def forward(self, x):
+        x = x.squeeze(1)
+        features = self.conv_layers(x)
+        # Features output: [batch_size, 128, 1]
+        return self.projection(features.squeeze(-1))
+class MultimodalSentimentModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Encoders
+        self.text_encoder = TextEncoder()
+        self.video_encoder = VideoEncoder()
+        self.audio_encoder = AudioEncoder()
+        # Fusion layer
+        self.fusion_layer = nn.Sequential(
+            nn.Linear(128 * 3, 256),
+            nn.BatchNorm1d(256),
+            nn.ReLU(),
+            nn.Dropout(0.3)
+        )
+        # Classification heads
+        self.emotion_classifier = nn.Sequential(
+            nn.Linear(256, 64),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(64, 7)  # Sadness, anger
+        )
+        self.sentiment_classifier = nn.Sequential(
+            nn.Linear(256, 64),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(64, 3)  # Negative, positive, neutral
+        )
+    def forward(self, text_inputs, video_frames, audio_features):
+        text_features = self.text_encoder(
+            text_inputs['input_ids'],
+            text_inputs['attention_mask'],
+        )
+        video_features = self.video_encoder(video_frames)
+        audio_features = self.audio_encoder(audio_features)
+        # Concatenate multimodal features
+        combined_features = torch.cat([
+            text_features,
+            video_features,
+            audio_features
+        ], dim=1)  # [batch_size, 128 * 3]
+        fused_features = self.fusion_layer(combined_features)
+        emotion_output = self.emotion_classifier(fused_features)
+        sentiment_output = self.sentiment_classifier(fused_features)
+        return {
+            'emotions': emotion_output,
+            'sentiments': sentiment_output
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+torch
+torchaudio
+torchvision
+transformers
+openai-whisper
+opencv-python
+numpy
+soundfile
+ffmpeg-python
+fastapi
+uvicorn
+python-multipart
+gradio

saved_models/checkpoint.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3f4c798f9853443a7acb6280033dc573c23861116cc00f5d0beccdc0e5caa8a
+size 572162402