add ignored dataset files
- script/inference.py +1 -1
- script/train.py +4 -4
- script/visualization/visualize.py +1 -1
- src/dataset/dataset.py +59 -0
- src/dataset/video_utils.py +132 -0
script/inference.py
CHANGED
@@ -6,7 +6,7 @@ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 from src.utils.utils import get_latest_run_dir, get_latest_model_path, get_config
 from src.models.model import load_model
-from src.
+from src.dataset.video_utils import create_transform, extract_frames
 
 def setup_model(run_dir=None):
     """Setup model and configuration"""
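With this import, inference reuses the same preprocessing as training. A minimal sketch of running a single clip through the model, assuming setup_model returns the model together with its config (the diff shows only the signature, so the return shape and the clip path here are assumptions):

import torch

# Hypothetical usage; setup_model's return value is not shown in this diff.
model, config = setup_model()
transform = create_transform(config)  # eval-time pipeline (training=False)
frames, ok = extract_frames(
    "clip.mp4",  # placeholder path
    {"max_frames": config["max_frames"], "sigma": config["sigma"]},
    transform,
)
if ok:
    with torch.no_grad():
        logits = model(frames.unsqueeze(0))  # batch dimension; model input shape assumed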
script/train.py
CHANGED
@@ -12,9 +12,9 @@ import sys
 sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 from src.utils.utils import create_run_directory
-from src.
+from src.dataset.dataset import VideoDataset
 from src.models.model import create_model
-from src.
+from src.dataset.video_utils import create_transform
 
 def train_and_evaluate(config):
     # Create a run directory if it doesn't exist
@@ -228,11 +228,11 @@ def main():
     config = {
         "class_labels": class_labels,
         "num_classes": len(class_labels),
-        "data_path": '../finetune/
+        "data_path": '../finetune/3moves_otherpeopleval',
         "batch_size": 32,
         "learning_rate": 2e-6,
         "weight_decay": 0.007,
-        "num_epochs":
+        "num_epochs": 50,
         "patience": 10, # for early stopping
         "max_frames": 10,
         "sigma": 0.3,
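This config feeds directly into the new VideoDataset. A minimal sketch of wiring them together, assuming a train.csv of video_path,label rows under data_path; the class labels, CSV name, and the transform keys create_transform validates (image_size, normalization_mean, normalization_std) are not shown in this hunk, so those values are placeholders:

from torch.utils.data import DataLoader
from src.dataset.dataset import VideoDataset

config = {
    "class_labels": ["move_a", "move_b", "move_c"],     # placeholder labels
    "max_frames": 10,
    "sigma": 0.3,
    "image_size": 224,                                  # assumed; matches the 224x224 error tensor
    "normalization_mean": [0.485, 0.456, 0.406],        # ImageNet defaults, assumed
    "normalization_std": [0.229, 0.224, 0.225],
}

dataset = VideoDataset("../finetune/3moves_otherpeopleval/train.csv", config)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for frames, labels, paths in loader:
    # frames: (batch, max_frames, 3, image_size, image_size)
    break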
script/visualization/visualize.py
CHANGED
@@ -9,7 +9,7 @@ import os
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
 
-from src.
+from src.dataset.dataset import VideoDataset
 from src.utils.utils import get_latest_model_path, get_latest_run_dir, get_config
 from src.models.model import load_model
 
src/dataset/dataset.py
ADDED
@@ -0,0 +1,59 @@
+import torch
+from torch.utils.data import Dataset
+import csv
+from .video_utils import create_transform, extract_frames
+
+class VideoDataset(Dataset):
+    def __init__(self, file_path, config, transform=None):
+        self.data = []
+        self.label_map = {}
+        # Use create_transform if no custom transform is provided
+        self.transform = transform or create_transform(config)
+
+        # Validate required config keys
+        required_keys = {"max_frames", "sigma", "class_labels"}
+        missing_keys = required_keys - set(config.keys())
+        if missing_keys:
+            raise ValueError(f"Missing required config keys: {missing_keys}")
+
+        self.max_frames = config['max_frames']
+        self.sigma = config['sigma']
+
+        # Create label map from class_labels list
+        self.label_map = {i: label for i, label in enumerate(config['class_labels'])}
+
+        # Read the CSV file and parse the data
+        with open(file_path, 'r') as file:
+            csv_reader = csv.reader(file)
+            for row in csv_reader:
+                if len(row) != 2:
+                    print(f"Skipping invalid row: {row}")
+                    continue
+                video_path, label = row
+                try:
+                    label = int(label)
+                except ValueError:
+                    print(f"Skipping row with invalid label: {row}")
+                    continue
+                self.data.append((video_path, label))
+
+        if not self.data:
+            raise ValueError(f"No valid data found in the CSV file: {file_path}")
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        video_path, label = self.data[idx]
+
+        frames, success = extract_frames(video_path,
+                                         {"max_frames": self.max_frames, "sigma": self.sigma},
+                                         self.transform)
+
+        if not success:
+            frames = self._get_error_tensor()
+
+        return frames, label, video_path
+
+    def _get_error_tensor(self):
+        return torch.zeros((self.max_frames, 3, 224, 224))
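The dataset expects a two-column CSV of video path and integer label index, with label_map translating indices back to class names. A minimal sketch of the expected file and a lookup; the file name, labels, and transform-key values here are illustrative, not from this commit:

# annotations.csv (hypothetical):
#   videos/clip_001.mp4,0
#   videos/clip_002.mp4,2

config = {
    "max_frames": 10,
    "sigma": 0.3,
    "class_labels": ["jab", "cross", "hook"],        # placeholder labels
    "image_size": 224,                               # assumed transform keys
    "normalization_mean": [0.485, 0.456, 0.406],
    "normalization_std": [0.229, 0.224, 0.225],
}

ds = VideoDataset("annotations.csv", config)
frames, label, path = ds[0]
print(ds.label_map[label])  # human-readable class name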
src/dataset/video_utils.py
ADDED
@@ -0,0 +1,132 @@
+import cv2
+import numpy as np
+import torch
+from torchvision import transforms
+from scipy.stats import norm
+import os
+
+def create_transform(config, training=False):
+    """Create transform pipeline based on config"""
+    # Validate base required keys
+    required_keys = {
+        "image_size",
+        "normalization_mean",
+        "normalization_std"
+    }
+
+    # Add training-specific required keys
+    if training:
+        required_keys.update({
+            "flip_probability",
+            "rotation_degrees",
+            "brightness_jitter",
+            "contrast_jitter",
+            "saturation_jitter",
+            "hue_jitter",
+            "crop_scale_min",
+            "crop_scale_max"
+        })
+
+    missing_keys = required_keys - set(config.keys())
+    if missing_keys:
+        raise ValueError(f"Missing required config keys: {missing_keys}")
+
+    # Build transform list
+    transform_list = [
+        transforms.ToPILImage(),
+        transforms.Resize((config["image_size"], config["image_size"]))
+    ]
+
+    # Add training augmentations if needed
+    if training:
+        transform_list.extend([
+            transforms.RandomHorizontalFlip(p=config["flip_probability"]),
+            transforms.RandomRotation(config["rotation_degrees"]),
+            transforms.ColorJitter(
+                brightness=config["brightness_jitter"],
+                contrast=config["contrast_jitter"],
+                saturation=config["saturation_jitter"],
+                hue=config["hue_jitter"]
+            ),
+            transforms.RandomResizedCrop(
+                config["image_size"],
+                scale=(config["crop_scale_min"], config["crop_scale_max"])
+            )
+        ])
+
+    # Add final transforms
+    transform_list.extend([
+        transforms.ToTensor(),
+        transforms.Normalize(
+            mean=config["normalization_mean"],
+            std=config["normalization_std"]
+        )
+    ])
+
+    return transforms.Compose(transform_list)
+
+def extract_frames(video_path: str, config: dict, transform) -> tuple[torch.Tensor, bool]:
+    """Extract and process frames from video using Gaussian sampling
+    Returns:
+        tuple: (frames tensor, success boolean)
+    """
+    # Validate required config keys
+    required_keys = {"max_frames", "sigma"}
+    missing_keys = required_keys - set(config.keys())
+    if missing_keys:
+        raise ValueError(f"Missing required config keys for frame extraction: {missing_keys}")
+
+    frames = []
+    success = True
+
+    if not os.path.exists(video_path):
+        print(f"File not found: {video_path}")
+        return None, False
+
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        print(f"Failed to open video: {video_path}")
+        return None, False
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if total_frames == 0:
+        print(f"Video has no frames: {video_path}")
+        cap.release()
+        return None, False
+
+    # Create a normal distribution centered at the middle of the video
+    x = np.linspace(0, 1, total_frames)
+    probabilities = norm.pdf(x, loc=0.5, scale=config["sigma"])
+    probabilities /= probabilities.sum()
+
+    # Sample frame indices based on this distribution
+    frame_indices = np.sort(np.random.choice(
+        total_frames,
+        size=min(config["max_frames"], total_frames),
+        replace=False,
+        p=probabilities
+    ))
+
+    for frame_idx in frame_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
+        ret, frame = cap.read()
+        if not ret:
+            print(f"Failed to read frame {frame_idx} from video: {video_path}")
+            success = False
+            break
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        if transform:
+            frame = transform(frame)
+        frames.append(frame)
+
+    cap.release()
+
+    if not frames:
+        print(f"No frames extracted from video: {video_path}")
+        return None, False
+
+    # Pad with zeros if we don't have enough frames
+    while len(frames) < config["max_frames"]:
+        frames.append(torch.zeros_like(frames[0]))
+
+    return torch.stack(frames), success
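The Gaussian sampling in extract_frames biases frame selection toward the middle of a clip: each frame position in [0, 1] is weighted by a normal PDF centered at 0.5, so a small sigma (0.3 in the train config) concentrates samples near the midpoint while still occasionally drawing from the ends. A standalone sketch of just that sampling step, using an assumed 100-frame toy video:

import numpy as np
from scipy.stats import norm

total_frames, max_frames, sigma = 100, 10, 0.3  # toy values; sigma matches the config above

# Normalized position of each frame in [0, 1]
x = np.linspace(0, 1, total_frames)
p = norm.pdf(x, loc=0.5, scale=sigma)
p /= p.sum()  # normalize into a valid probability distribution

# Draw distinct indices, weighted toward the clip's middle, sorted to preserve temporal order
indices = np.sort(np.random.choice(total_frames, size=min(max_frames, total_frames),
                                   replace=False, p=p))
print(indices)  # mostly indices near the middle of the clip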