Create example_usage.py
example_usage.py +206 -0
example_usage.py
ADDED
@@ -0,0 +1,206 @@
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from torchvision import models, transforms
import torch.nn as nn
import os
import json
import cv2
from PIL import Image
import gradio as gr


class MultimodalRiskBehaviorModel(nn.Module):
    def __init__(self, text_model_name="bert-base-uncased", hidden_dim=512, dropout=0.3):
        super(MultimodalRiskBehaviorModel, self).__init__()

        # Text model using AutoModelForSequenceClassification
        self.text_model_name = text_model_name
        self.text_model = AutoModelForSequenceClassification.from_pretrained(text_model_name, num_labels=1)

        # Visual model (ResNet50)
        self.visual_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        visual_feature_dim = self.visual_model.fc.in_features
        self.visual_model.fc = nn.Identity()

        # Fusion and classification layer setup
        text_feature_dim = self.text_model.config.hidden_size
        self.fc1 = nn.Linear(text_feature_dim + visual_feature_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, encoding, frames):
        # Run inputs on whatever device the model's parameters live on
        device = next(self.parameters()).device
        input_ids = encoding['input_ids'].squeeze(1).to(device)
        attention_mask = encoding['attention_mask'].squeeze(1).to(device)

        # Extract text and visual features
        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask).logits
        frames = frames.to(device)

        batch_size, num_frames, channels, height, width = frames.size()
        frames = frames.view(batch_size * num_frames, channels, height, width)
        visual_features = self.visual_model(frames)
        visual_features = visual_features.view(batch_size, num_frames, -1).mean(dim=1)

        # Combine and classify
        combined_features = torch.cat((text_features, visual_features), dim=1)
        x = self.dropout(torch.relu(self.fc1(combined_features)))
        output = torch.sigmoid(self.fc2(x))

        return output

    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_directory, 'pytorch_model.bin'))
        config = {
            "text_model_name": self.text_model_name,
            "hidden_dim": self.fc1.out_features
        }
        with open(os.path.join(save_directory, 'config.json'), 'w') as f:
            json.dump(config, f)

    @classmethod
    def from_pretrained(cls, load_directory, map_location=None):
        if os.path.exists(load_directory):
            # Load from a local directory created by save_pretrained
            config_path = os.path.join(load_directory, 'config.json')
            state_dict_path = os.path.join(load_directory, 'pytorch_model.bin')

            with open(config_path, 'r') as f:
                config_dict = json.load(f)
            model = cls(text_model_name=config_dict["text_model_name"], hidden_dim=config_dict["hidden_dim"])
            state_dict = torch.load(state_dict_path, map_location=map_location)
            model.load_state_dict(state_dict)

        else:
            # Otherwise treat load_directory as a Hugging Face Hub model id
            hf_model = AutoModelForSequenceClassification.from_pretrained(load_directory, num_labels=2)
            model = cls(text_model_name=hf_model.config.name_or_path, hidden_dim=hf_model.config.hidden_size)
            model.text_model = hf_model

        return model

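# Illustrative sketch only: save_pretrained / from_pretrained round-trip through a local
# directory; "./risk_model" is a placeholder path, not part of this commit.
#   model.save_pretrained("./risk_model")   # writes pytorch_model.bin and config.json
#   model = MultimodalRiskBehaviorModel.from_pretrained("./risk_model")
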
# Load the tokenizer and model weights from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained('Souha-BH/BERT_Resnet50')
model = MultimodalRiskBehaviorModel.from_pretrained('Souha-BH/BERT_Resnet50')  # on a CPU-only machine, pass map_location='cpu'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Function to load frames from a video
def load_frames_from_video(video_path, transform, num_frames=10):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0
    while frame_count < num_frames:  # Limit to a number of frames for efficiency
        success, frame = cap.read()
        if not success:
            break
        # Convert frame (NumPy array) to PIL image and apply transformations
        frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frame = transform(frame)
        frames.append(frame)
        frame_count += 1
    cap.release()

    # Stack frames and add batch dimension (1, num_frames, channels, height, width)
    frames = torch.stack(frames)
    frames = frames.unsqueeze(0)  # Add batch dimension
    return frames

def predict_video(model, video_path, text_input, tokenizer, transform):
    try:
        # Set model to evaluation mode
        model.eval()

        # Tokenize the text input
        encoding = tokenizer(
            text_input, padding='max_length', truncation=True, max_length=128, return_tensors='pt'
        )
        encoding = {key: val.to(device) for key, val in encoding.items()}

        # Load frames from the video
        frames = load_frames_from_video(video_path, transform)
        frames = frames.to(device)

        # Log input shapes and devices
        print(f"Encoding device: {next(iter(encoding.values())).device}, Frames shape: {frames.shape}")

        # Perform forward pass through the model
        with torch.no_grad():
            output = model(encoding, frames)

        # The model already outputs a sigmoid probability; threshold it to get the class
        prediction = (output.squeeze(-1) > 0.5).float()

        return prediction.item()

    except Exception as e:
        print(f"Prediction error: {e}")
        return "Error during prediction"


transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


# Define your video paths and captions
video_paths = [
    'https://drive.google.com/uc?export=download&id=1iWq1q1LM-jmf4iZxOqZTw4FaIBekJowM',
    'https://drive.google.com/uc?export=download&id=1_egBaC1HD2kIZgRRKsnCtsWG94vg1c7n',
    'https://drive.google.com/uc?export=download&id=12cGxBEkfU5Q1Ezg2jRk6zGyn2hoR3JLj'
]

video_captions = [
    "Everytime i start a diet كل مرة أحاول أبدأ ريجيم 😓 #dietmemes #funnyvideos #animetiktok",
    "New sandwich from burger king 🍔👑 #mukbang #asmr #asmrmukbang #asmrsounds #eat #food #Foodie moe eats #yummy #cheese #chicken #burger #fries #burgerking @Burger King",
    "all workout guides l!nked in bi0 // honestly huge moment 😂 I’ve been so focused on growing my upper body that this feels like it finally shows! shorts from @KEEPTHATPUMP #upperbody #upperbodyworkout #glutegains #glutegrowth #gluteexercise #workout #strengthtraining #gym #trending #fyp"
]


def predict_risk(video_index):
    video_path = video_paths[video_index]
    text_input = video_captions[video_index]

    # Make prediction (0.0 / 1.0, or an error string if predict_video failed)
    prediction = predict_video(model, video_path, text_input, tokenizer, transform)
    if isinstance(prediction, str):
        return prediction

    # Return the corresponding label
    return "Risky Health Behavior" if prediction == 1 else "Not Risky Health Behavior"

# Interface setup
with gr.Blocks() as interface:
    gr.Markdown("# Risk Behavior Prediction")
    gr.Markdown("Select a video to classify its behavior as risky or not.")

    # Input option selector
    video_selector = gr.Radio(["Video 1", "Video 2", "Video 3"], label="Choose a Video")

    # Return the selected URL and caption; the URL is rendered by the Gradio `gr.Video` component
    def show_selected_video(choice):
        idx = int(choice.split()[-1]) - 1
        return video_paths[idx], f"**Caption:** {video_captions[idx]}"

    video_player = gr.Video(width=320, height=240)
    caption_box = gr.Markdown()

    video_selector.change(
        fn=show_selected_video,
        inputs=video_selector,
        outputs=[video_player, caption_box]
    )

    # Prediction button and output
    predict_button = gr.Button("Predict Risk")
    output_text = gr.Textbox(label="Prediction")

    predict_button.click(
        fn=lambda idx: predict_risk(int(idx.split()[-1]) - 1),
        inputs=video_selector,
        outputs=output_text
    )

# Launch the app
interface.launch()
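For a quick check outside the Gradio UI, the same helpers can be called directly. A minimal sketch, assuming a local clip my_clip.mp4 and a placeholder caption (neither is part of this commit):

# Placeholder inputs for illustration only
label = predict_video(model, "my_clip.mp4", "trying a 3-day water fast #fasting #fyp", tokenizer, transform)
print("Risky Health Behavior" if label == 1 else "Not Risky Health Behavior")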