MNGames committed on
Commit 29cb7aa
1 Parent(s): e7b04ff

Update app.py

Files changed (1)
app.py  +54 -62
app.py CHANGED
@@ -1,68 +1,77 @@

Old version (lines marked "-" were removed):

  import gradio as gr
- from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
  import torch
- import cv2  # OpenCV for video processing
-
- # Model ID for video classification (UCF101 subset)
- model_id = "MCG-NJU/videomae-base"
-
- # Parameters for frame extraction
- TARGET_FRAME_COUNT = 16
- FRAME_SIZE = (224, 224)  # Expected frame size for the model

  def analyze_video(video):
      # Extract key frames from the video using OpenCV
-     frames = extract_key_frames(video, TARGET_FRAME_COUNT)
-
-     # Resize frames to the expected size
-     frames = [cv2.resize(frame, FRAME_SIZE) for frame in frames]

-     # Load model and feature extractor manually
-     model = VideoMAEForVideoClassification.from_pretrained(model_id)
-     processor = VideoMAEImageProcessor.from_pretrained(model_id)

-     # Prepare frames for the model
      inputs = processor(images=frames, return_tensors="pt")

-     # Make predictions
      with torch.no_grad():
-         outputs = model(**inputs)

      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)

-     # Analyze predictions for insights related to the play
-     results = []
-     for prediction in predictions:
-         result = analyze_predictions_ucf101(prediction.item())
-         results.append(result)

-     # Aggregate results across frames and provide a final analysis
-     final_result = aggregate_results(results)

      return final_result

- def extract_key_frames(video, target_frame_count):
      cap = cv2.VideoCapture(video)
      frames = []
      frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-     # Calculate interval for frame extraction
-     interval = max(1, frame_count // target_frame_count)
-
-     for i in range(0, frame_count, interval):
-         cap.set(cv2.CAP_PROP_POS_FRAMES, i)
          ret, frame = cap.read()
-         if ret:
-             frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert to RGB
-             if len(frames) >= target_frame_count:
-                 break
-
      cap.release()
      return frames

- def analyze_predictions_ucf101(prediction):
-     # Map prediction to action labels (this mapping is hypothetical)
      action_labels = {
          0: "running",
          1: "sliding",
@@ -70,37 +79,20 @@ def analyze_predictions_ucf101(prediction):
          # Add more labels as necessary
      }
      action = action_labels.get(prediction, "unknown")
-
-     relevant_actions = ["running", "sliding", "jumping"]
-     if action in relevant_actions:
-         if action == "sliding":
-             return "potentially safe"
-         elif action == "running":
-             return "potentially out"
-         else:
-             return "inconclusive"
-     else:
-         return "inconclusive"

  def aggregate_results(results):
-     # Combine insights from analyzing each frame (e.g., dominant action classes, confidence scores)
-     safe_count = results.count("potentially safe")
-     out_count = results.count("potentially out")
-
-     if safe_count > out_count:
-         return "Safe"
-     elif out_count > safe_count:
-         return "Out"
-     else:
-         return "Inconclusive"

  # Gradio interface
  interface = gr.Interface(
      fn=analyze_video,
      inputs="video",
      outputs="text",
-     title="Baseball Play Analysis (UCF101 Subset Exploration)",
-     description="Upload a video of a baseball play (safe/out at a base). This app explores using a video classification model (UCF101 subset) for analysis. Note: The model might not be specifically trained for baseball plays."
  )

- interface.launch(share=True)
 
New version (lines marked "+" were added):

  import gradio as gr
  import torch
+ import cv2
+ import numpy as np
+ from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

+ # Model IDs for video classification (UCF101 subset)
+ classification_model_id = "MCG-NJU/videomae-base"

+ # Object detection model (you can replace this with a more accurate one if needed)
+ object_detection_model = "yolov5s"

  def analyze_video(video):
      # Extract key frames from the video using OpenCV
+     frames = extract_key_frames(video)

+     # Load classification model and image processor
+     classification_model = VideoMAEForVideoClassification.from_pretrained(classification_model_id)
+     processor = VideoMAEImageProcessor.from_pretrained(classification_model_id)

+     # Prepare frames for the classification model
      inputs = processor(images=frames, return_tensors="pt")

+     # Make predictions using the classification model
      with torch.no_grad():
+         outputs = classification_model(**inputs)

      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)

+     # Object detection and tracking (ball and baseman)
+     object_detection_results = []
+     for frame in frames:
+         ball_position = detect_object(frame, "ball")
+         baseman_position = detect_object(frame, "baseman")
+         object_detection_results.append((ball_position, baseman_position))
+
+     # Analyze predictions and object detection results
+     analysis_results = []
+     for prediction, (ball_position, baseman_position) in zip(predictions, object_detection_results):
+         result = analyze_frame(prediction.item(), ball_position, baseman_position)
+         analysis_results.append(result)

+     # Aggregate analysis results
+     final_result = aggregate_results(analysis_results)

      return final_result

+ def extract_key_frames(video):
      cap = cv2.VideoCapture(video)
      frames = []
      frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     for i in range(frame_count):
          ret, frame = cap.read()
+         if ret and i % (fps // 2) == 0:  # Extract a frame every half second
+             frames.append(frame)
      cap.release()
      return frames

+ def detect_object(frame, object_type):
+     # Placeholder function for object detection (replace with actual implementation)
+     # Here, we assume that the object is detected at the center of the frame
+     h, w, _ = frame.shape
+     if object_type == "ball":
+         return (w // 2, h // 2)  # Return center coordinates for the ball
+     elif object_type == "baseman":
+         return (w // 2, h // 2)  # Return center coordinates for the baseman
+     else:
+         return None
+
+ def analyze_frame(prediction, ball_position, baseman_position):
+     # Placeholder function for analyzing a single frame
+     # You can replace this with actual logic based on your requirements
      action_labels = {
          0: "running",
          1: "sliding",
          # Add more labels as necessary
      }
      action = action_labels.get(prediction, "unknown")
+     return {"action": action, "ball_position": ball_position, "baseman_position": baseman_position}

  def aggregate_results(results):
+     # Placeholder function for aggregating analysis results
+     # You can implement this based on your specific requirements
+     return results

  # Gradio interface
  interface = gr.Interface(
      fn=analyze_video,
      inputs="video",
      outputs="text",
+     title="Baseball Play Analysis",
+     description="Upload a video of a baseball play to analyze.",
  )

+ interface.launch()
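
A note on the new extract_key_frames: i % (fps // 2) raises ZeroDivisionError whenever OpenCV reports an FPS below 2 (cap.get(cv2.CAP_PROP_FPS) can return 0 for some containers), the frames stay in OpenCV's BGR channel order even though the old version converted them to RGB, and the clip length is no longer pinned to the 16 frames VideoMAE checkpoints are normally trained on. A minimal, more defensive sketch; the signature and the NUM_FRAMES constant are assumptions, not part of this commit:

import cv2
import numpy as np

NUM_FRAMES = 16  # assumption: VideoMAE checkpoints typically expect 16-frame clips

def extract_key_frames(video_path, num_frames=NUM_FRAMES):
    """Sample num_frames evenly spaced RGB frames from a video file (hypothetical replacement)."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    if total <= 0:
        cap.release()
        return frames
    # Evenly spaced indices across the clip, independent of the reported FPS
    indices = np.linspace(0, total - 1, num=num_frames, dtype=int)
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if not ret:
            continue
        # OpenCV decodes to BGR; Hugging Face image processors expect RGB
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return frames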
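
MCG-NJU/videomae-base is the self-supervised backbone only, so VideoMAEForVideoClassification.from_pretrained attaches a freshly initialised classification head, and the hand-written {0: "running", 1: "sliding"} mapping does not correspond to anything the model has learned. One option, sketched under the assumption that a generic action vocabulary is acceptable, is to load a fine-tuned checkpoint such as MCG-NJU/videomae-base-finetuned-kinetics and read its own config.id2label:

import torch
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

# Assumption: the Kinetics-400 fine-tuned checkpoint is an acceptable substitute
ckpt = "MCG-NJU/videomae-base-finetuned-kinetics"
processor = VideoMAEImageProcessor.from_pretrained(ckpt)
model = VideoMAEForVideoClassification.from_pretrained(ckpt)

def classify_clip(frames_rgb):
    """frames_rgb: list of 16 RGB frames (H, W, 3). Returns the predicted label string."""
    inputs = processor(frames_rgb, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    return model.config.id2label[int(logits.argmax(-1))]

The Kinetics vocabulary covers generic actions rather than base-running outcomes, so any safe/out heuristic would still have to map from whatever labels the checkpoint returns.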
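
object_detection_model = "yolov5s" is declared but never loaded, and detect_object returns the frame centre for every query. If YOLOv5 was the intent, one way it could be wired in is through torch.hub; the class mapping below is an assumption on my part, since the COCO-pretrained model knows "sports ball" and "person" but has no "baseman" class:

import torch

# Load once at startup; torch.hub fetches the ultralytics/yolov5 repo on first use
yolo = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)

# Assumed mapping: COCO has no "baseman", so "person" is only a rough stand-in
COCO_NAME_FOR = {"ball": "sports ball", "baseman": "person"}

def detect_object(frame_rgb, object_type):
    """Return the (x, y) centre of the highest-confidence matching box, or None."""
    target = COCO_NAME_FOR.get(object_type)
    if target is None:
        return None
    results = yolo(frame_rgb)
    best = None
    for *box, conf, cls in results.xyxy[0].tolist():  # rows are [x1, y1, x2, y2, conf, cls]
        if results.names[int(cls)] == target and (best is None or conf > best[1]):
            x1, y1, x2, y2 = box
            best = (((x1 + x2) / 2, (y1 + y2) / 2), conf)
    return best[0] if best else None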
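
As committed, analyze_frame only echoes its inputs and aggregate_results returns the raw list, so the Gradio text box shows a Python repr of dictionaries rather than a safe/out call. A sketch of one way to recover the previous version's verdict while using the new positional data; call_frame is a hypothetical helper (not the committed analyze_frame) and reach_px is a made-up pixel threshold standing in for a real tag/force-out rule:

from collections import Counter

def call_frame(action, ball_position, baseman_position, reach_px=80):
    """Rough per-frame call from the predicted action and detected positions (hypothetical)."""
    if ball_position is None or baseman_position is None:
        return "inconclusive"
    dx = ball_position[0] - baseman_position[0]
    dy = ball_position[1] - baseman_position[1]
    ball_is_close = (dx * dx + dy * dy) ** 0.5 < reach_px  # reach_px is an assumed threshold
    if action == "sliding" and not ball_is_close:
        return "potentially safe"
    if action == "running" and ball_is_close:
        return "potentially out"
    return "inconclusive"

def aggregate_results(frame_calls):
    """Majority vote over the per-frame calls, ignoring inconclusive frames (ties lean Safe)."""
    votes = Counter(c for c in frame_calls if c != "inconclusive")
    if not votes:
        return "Inconclusive"
    return "Safe" if votes["potentially safe"] >= votes["potentially out"] else "Out"

Returning plain strings keeps the vote trivial, at the cost of discarding confidence; weighting each call by the classifier's softmax probability would be a natural refinement.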