thienphuc12339 committed on
Commit 9f83ce9 · 1 Parent(s): 4cc0c69

Add all source code

Files changed (40)
  1. .dockerignore +12 -0
  2. Dockerfile +27 -0
  3. README.md +29 -9
  4. __init__.py +3 -0
  5. app.py +311 -0
  6. models/dsta_slr_joint_motion_v3_0.onnx +3 -0
  7. models/sl_gcn_joint_v3_0.onnx +3 -0
  8. models/spoter_v3.0.onnx +3 -0
  9. request.py +15 -0
  10. requirements.txt +23 -0
  11. src/configs/__init__.py +1 -0
  12. src/configs/arguments.py +174 -0
  13. src/data/__init__.py +1 -0
  14. src/data/__pycache__/__init__.cpython-312.pyc +0 -0
  15. src/data/__pycache__/__init__.cpython-39.pyc +0 -0
  16. src/data/__pycache__/utils.cpython-312.pyc +0 -0
  17. src/data/__pycache__/utils.cpython-39.pyc +0 -0
  18. src/data/utils.py +157 -0
  19. src/inference.py +271 -0
  20. src/main.py +51 -0
  21. src/tools/__init__.py +3 -0
  22. src/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  23. src/tools/__pycache__/__init__.cpython-39.pyc +0 -0
  24. src/tools/__pycache__/features.cpython-39.pyc +0 -0
  25. src/tools/__pycache__/models.cpython-312.pyc +0 -0
  26. src/tools/__pycache__/models.cpython-39.pyc +0 -0
  27. src/tools/features.py +29 -0
  28. src/tools/models.py +441 -0
  29. src/utils/__init__.py +2 -0
  30. src/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  31. src/utils/__pycache__/constants.cpython-312.pyc +0 -0
  32. src/utils/__pycache__/loggers.cpython-312.pyc +0 -0
  33. src/utils/constants.py +158 -0
  34. src/utils/loggers.py +24 -0
  35. src/visualization/__init__.py +1 -0
  36. src/visualization/__pycache__/__init__.cpython-312.pyc +0 -0
  37. src/visualization/__pycache__/__init__.cpython-39.pyc +0 -0
  38. src/visualization/__pycache__/utils.cpython-312.pyc +0 -0
  39. src/visualization/__pycache__/utils.cpython-39.pyc +0 -0
  40. src/visualization/utils.py +55 -0
.dockerignore ADDED
@@ -0,0 +1,12 @@
+ # Ignore build artifacts
+ *.log
+ *.tmp
+
+ # Ignore compiled Python files
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+
+ # Ignore files/directories
+ # engines/data/
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.10-slim
+
+ # Disable buffering so logs reach the terminal immediately
+ ENV PYTHONUNBUFFERED=1
+
+ # Install the required system libraries
+ RUN apt-get update && apt-get install -y \
+     libgl1-mesa-glx \
+     libglib2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Copy requirements.txt into the container and install the dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the entire codebase into the container
+ COPY . .
+
+ # Set the PORT environment variable (Hugging Face routes traffic to this port)
+ ENV PORT 7860
+ EXPOSE 7860
+
+ # Run the FastAPI application with uvicorn
+ # This assumes the main app file is app.py and the FastAPI instance is named `app`
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,30 @@
- ---
- title: SignLanguage
- emoji: 🦀
- colorFrom: purple
- colorTo: indigo
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Vietnamese Sign Language Translation
+
+ Vietnamese Sign Language Translation is a project focused on developing advanced AI technology to accurately interpret Vietnamese sign language from body movements.
+
+ ## Installation
+ 1. Create an environment with `Python == 3.9.19`
+ 2. Install `PyTorchVideo`
+ ```
+ cd src/libs
+ git clone https://github.com/facebookresearch/pytorchvideo.git
+ pip install -e pytorchvideo
+ ```
+ 3. Install the other requirements
+ ```
+ cd ../..
+ pip install -r requirements.txt
+ ```
+
+ ## Inference
+ 1. Prepare a configuration for inference. Templates for each architecture can be found at `src/configs`.
+ 2. Modify the inference config:
+ ```
+ inference:
+   source: webcam or path/to/video.mp4
+   output_dir: path/to/output/dir
+ ```
+ 3. Run this command from the `root` directory of the project to start inference:
+ ```
+ python src/inference.py --config_path path/to/config.yaml
+ ```
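For readers who prefer calling the pipeline from Python rather than the CLI, a minimal sketch is given below. It assumes the code is run from the `src` directory with the requirements installed; the architecture, checkpoint, and video path are placeholders to replace with your own.

```python
# Minimal programmatic equivalent of `python src/inference.py --config_path ...`
# (a sketch; arch, pretrained and source below are placeholder values).
from configs import ModelConfig, InferenceConfig
from utils import config_logger, POSE_BASED_MODELS
from tools import load_pipeline
from inference import inference

model_config = ModelConfig(arch="spoter", pretrained="vsltranslation/spoter_v3.0")
inference_config = InferenceConfig(
    source="path/to/video.mp4",   # or "webcam"; must exist, per InferenceConfig.__post_init__
    output_dir="demo/run_1",
    use_onnx=True,
    visualize=False,
)
# Pose-based architectures feed MediaPipe landmarks (not raw frames) to the pipeline
inference_config.use_pose_model = model_config.arch in POSE_BASED_MODELS

config_logger(inference_config.output_dir / "inference.log")
pipeline = load_pipeline(model_config, inference_config)
inference(model_config, inference_config, pipeline)
```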
__init__.py ADDED
@@ -0,0 +1,3 @@
+ # WRITER: PhucNTT2 # EMAIL: thienphuc12339@gmail.com # DATE: 11/2023
+ # FROM: akaOCR Team
+ # ALL USE CASES MUST BE APPROVED BY AKAOCR TEAM
app.py ADDED
@@ -0,0 +1,311 @@
+ import logging
+ from time import time
+ import pandas as pd
+ import numpy as np
+ import cv2
+ from typing import Optional
+ from pathlib import Path
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Query
+ from fastapi.responses import JSONResponse
+ import mediapipe as mp
+
+ from configs import ModelConfig, InferenceConfig
+ from utils import config_logger, POSE_BASED_MODELS
+ from data import Arm, get_sample_timestamp, ok_to_get_frame
+ from tools import load_pipeline, Predictions
+ from visualization import draw_text_on_image
+
+ app = FastAPI()
+
+ # Define the three model presets
+ MODEL_PRESETS = {
+     "dsta_slr": {
+         "model": ModelConfig(
+             arch="dsta_slr",
+             pretrained="vsltranslation/dsta_slr_joint_motion_v3_0",
+         ),
+         "inference": InferenceConfig(
+             source="upload",  # use the uploaded video, not the webcam
+             output_dir="demo/run_1",
+             use_onnx=True,
+             show_skeleton=True,
+             visualize=True,
+             bone_stream=False,
+             motion_stream=True,
+         ),
+     },
+     "sl_gcn": {
+         "model": ModelConfig(
+             arch="sl_gcn",
+             pretrained="models/dsta_slr_joint_motion_v3_0.onnx",
+         ),
+         "inference": InferenceConfig(
+             source="upload",
+             output_dir="demo/run_1",
+             use_onnx=True,
+             show_skeleton=True,
+             visualize=True,
+             bone_stream=True,
+             motion_stream=False,
+         ),
+     },
+     "spoter": {
+         "model": ModelConfig(
+             arch="spoter",
+             pretrained="vsltranslation/spoter_v3.0",
+         ),
+         "inference": InferenceConfig(
+             source="upload",
+             output_dir="demo/run_1",
+             use_onnx=True,
+             show_skeleton=True,
+             visualize=True,
+         ),
+     },
+ }
+
+ config_logger("inference.log")
+ logging.info("API started")
+
+ SPOTER_POSE_LANDMARKS = [
+     mp.solutions.pose.PoseLandmark.NOSE,
+     mp.solutions.pose.PoseLandmark.LEFT_EYE,
+     mp.solutions.pose.PoseLandmark.RIGHT_EYE,
+     mp.solutions.pose.PoseLandmark.RIGHT_SHOULDER,
+     mp.solutions.pose.PoseLandmark.LEFT_SHOULDER,
+     mp.solutions.pose.PoseLandmark.RIGHT_ELBOW,
+     mp.solutions.pose.PoseLandmark.LEFT_ELBOW,
+     mp.solutions.pose.PoseLandmark.RIGHT_WRIST,
+     mp.solutions.pose.PoseLandmark.LEFT_WRIST
+ ]
+
+ SPOTER_HAND_LANDMARKS = [
+     mp.solutions.hands.HandLandmark.WRIST,
+     mp.solutions.hands.HandLandmark.INDEX_FINGER_TIP, mp.solutions.hands.HandLandmark.INDEX_FINGER_DIP,
+     mp.solutions.hands.HandLandmark.INDEX_FINGER_PIP, mp.solutions.hands.HandLandmark.INDEX_FINGER_MCP,
+     mp.solutions.hands.HandLandmark.MIDDLE_FINGER_TIP, mp.solutions.hands.HandLandmark.MIDDLE_FINGER_DIP,
+     mp.solutions.hands.HandLandmark.MIDDLE_FINGER_PIP, mp.solutions.hands.HandLandmark.MIDDLE_FINGER_MCP,
+     mp.solutions.hands.HandLandmark.RING_FINGER_TIP, mp.solutions.hands.HandLandmark.RING_FINGER_DIP,
+     mp.solutions.hands.HandLandmark.RING_FINGER_PIP, mp.solutions.hands.HandLandmark.RING_FINGER_MCP,
+     mp.solutions.hands.HandLandmark.PINKY_TIP, mp.solutions.hands.HandLandmark.PINKY_DIP,
+     mp.solutions.hands.HandLandmark.PINKY_PIP, mp.solutions.hands.HandLandmark.PINKY_MCP,
+     mp.solutions.hands.HandLandmark.THUMB_TIP, mp.solutions.hands.HandLandmark.THUMB_IP,
+     mp.solutions.hands.HandLandmark.THUMB_MCP, mp.solutions.hands.HandLandmark.THUMB_CMC,
+ ]
+
+
+ @app.get("/healthcheck")
+ async def healthcheck():
+     return JSONResponse(status_code=200, content={"status": "UP"})
+
+
+ def run_inference(model_config, inference_config, input_frames):
+     pipeline = load_pipeline(model_config, inference_config)
+     logging.info("Pipeline loaded")
+
+     right_arm = Arm("right", inference_config.visibility)
+     left_arm = Arm("left", inference_config.visibility)
+     data = []
+     results = None
+     predictions = Predictions()
+
+     mp_holistic = mp.solutions.holistic
+     mp_drawing = mp.solutions.drawing_utils
+     mp_drawing_styles = mp.solutions.drawing_styles
+
+     custom_pose_style = mp_drawing_styles.get_default_pose_landmarks_style()
+     custom_right_hand_style = mp_drawing_styles.get_default_hand_landmarks_style()
+     custom_left_hand_style = mp_drawing_styles.get_default_hand_landmarks_style()
+     custom_pose_connections = list(mp_holistic.POSE_CONNECTIONS)
+     custom_hand_connections = list(mp_holistic.HAND_CONNECTIONS)
+
+     if inference_config.show_skeleton:
+         pose_landmarks = SPOTER_POSE_LANDMARKS
+         hand_landmarks = SPOTER_HAND_LANDMARKS
+         for landmark in mp.solutions.pose.PoseLandmark:
+             if landmark in pose_landmarks:
+                 custom_pose_style[landmark] = mp_drawing.DrawingSpec(color=(0,255,0), thickness=2, circle_radius=2)
+             else:
+                 custom_pose_style[landmark] = mp_drawing.DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                 for connection_tuple in list(custom_pose_connections):
+                     if landmark.value in connection_tuple:
+                         custom_pose_connections.remove(connection_tuple)
+         for landmark in mp.solutions.hands.HandLandmark:
+             if landmark in hand_landmarks:
+                 custom_right_hand_style[landmark] = mp_drawing.DrawingSpec(color=(0,0,255), thickness=2, circle_radius=2)
+                 custom_left_hand_style[landmark] = mp_drawing.DrawingSpec(color=(255,0,0), thickness=2, circle_radius=2)
+             else:
+                 custom_right_hand_style[landmark] = mp_drawing.DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                 custom_left_hand_style[landmark] = mp_drawing.DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                 for connection_tuple in list(custom_hand_connections):
+                     if landmark.value in connection_tuple:
+                         custom_hand_connections.remove(connection_tuple)
+
+     writer = None
+     if inference_config.output_dir is not None:
+         out_path = Path(inference_config.output_dir)
+         out_path.mkdir(parents=True, exist_ok=True)
+         if len(input_frames) > 0 and isinstance(input_frames[0], np.ndarray):
+             h, w, _ = input_frames[0].shape
+             writer = cv2.VideoWriter(str(out_path / "output.mp4"), cv2.VideoWriter_fourcc(*"mp4v"), 30, (w, h))
+
+     with mp_holistic.Holistic(min_detection_confidence=0.9, min_tracking_confidence=0.5) as holistic:
+         # Assume each frame lasts ~33 ms; this is demo logic only
+         current_time_ms = 0
+         for frame in input_frames:
+             rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             rgb_frame.flags.writeable = False
+             detection_results = holistic.process(rgb_frame)
+
+             try:
+                 landmarks = detection_results.pose_landmarks.landmark
+             except Exception:
+                 current_time_ms += 33
+                 continue
+
+             left_arm.set_pose(landmarks)
+             right_arm.set_pose(landmarks)
+
+             left_arm_ok_to_get_frame = ok_to_get_frame(
+                 arm=left_arm,
+                 angle_threshold=inference_config.angle_threshold,
+                 min_num_up_frames=inference_config.min_num_up_frames,
+                 min_num_down_frames=inference_config.min_num_down_frames,
+                 current_time=current_time_ms,
+                 delay=inference_config.delay,
+             )
+             right_arm_ok_to_get_frame = ok_to_get_frame(
+                 arm=right_arm,
+                 angle_threshold=inference_config.angle_threshold,
+                 min_num_up_frames=inference_config.min_num_up_frames,
+                 min_num_down_frames=inference_config.min_num_down_frames,
+                 current_time=current_time_ms,
+                 delay=inference_config.delay,
+             )
+
+             if left_arm_ok_to_get_frame or right_arm_ok_to_get_frame:
+                 predictions = Predictions()
+                 data.append(detection_results if inference_config.use_pose_model else frame)
+
+             start_time, end_time = get_sample_timestamp(left_arm, right_arm)
+             start_time /= 1000
+             end_time /= 1000
+
+             if start_time != 0 and end_time != 0:
+                 start_inference_time = time()
+                 predictions = Predictions(predictions=pipeline(np.array(data)))
+                 predictions.inference_time = time() - start_inference_time
+                 predictions.start_time = start_time
+                 predictions.end_time = end_time
+                 logging.info(str(predictions))
+                 results = predictions.merge_results(results)
+
+                 # Reset
+                 start_time = 0
+                 end_time = 0
+                 left_arm.reset_state()
+                 right_arm.reset_state()
+                 data = []
+
+             # Draw the results on the frame
+             frame = left_arm.visualize(frame, (20, 10), "Left arm angle")
+             frame = right_arm.visualize(frame, (20, 40), "Right arm angle")
+             frame = predictions.visualize(frame, (20, 70))
+
+             if inference_config.show_skeleton:
+                 mp_drawing.draw_landmarks(
+                     frame,
+                     detection_results.pose_landmarks,
+                     connections=custom_pose_connections,
+                     landmark_drawing_spec=custom_pose_style
+                 )
+                 mp_drawing.draw_landmarks(
+                     frame,
+                     detection_results.right_hand_landmarks,
+                     connections=custom_hand_connections,
+                     landmark_drawing_spec=custom_right_hand_style
+                 )
+                 mp_drawing.draw_landmarks(
+                     frame,
+                     detection_results.left_hand_landmarks,
+                     connections=custom_hand_connections,
+                     landmark_drawing_spec=custom_left_hand_style
+                 )
+
+             if writer is not None:
+                 writer.write(frame)
+
+             current_time_ms += 33
+
+     if writer is not None:
+         writer.release()
+     if results is not None:
+         pd.DataFrame(results).to_csv(Path(inference_config.output_dir) / "results.csv", index=False)
+
+     return predictions.predictions, results
+
+
+ @app.post("/inference")
+ async def inference_endpoint(
+     model_name: str = Query(..., description="Choose model: dsta_slr, sl_gcn, spoter"),
+     output_option: str = Query("all", description="Output option: 'predictions', 'csv', 'video', 'all'"),
+     output_dir: str = Query("demo/run_1", description="Output directory for results"),
+     file: UploadFile = File(...)
+ ):
+     """
+     Inference endpoint:
+     - model_name: which model to use: dsta_slr, sl_gcn, spoter
+     - output_option: 'predictions', 'csv', 'video', or 'all'
+     - output_dir: output directory, e.g. 'my_results'
+     - file: a single uploaded video file
+     """
+
+     if model_name not in MODEL_PRESETS:
+         raise HTTPException(status_code=400, detail="Invalid model_name")
+
+     # Read the video from the uploaded file
+     video_bytes = np.asarray(bytearray(await file.read()), dtype=np.uint8)
+     temp_video_path = Path("temp_input.mp4")
+     with open(temp_video_path, "wb") as f:
+         f.write(video_bytes)
+     cap = cv2.VideoCapture(str(temp_video_path))
+
+     input_frames = []
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+         input_frames.append(frame)
+     cap.release()
+
+     # Load the configs from the selected preset
+     model_config = MODEL_PRESETS[model_name]["model"]
+     inference_config = MODEL_PRESETS[model_name]["inference"]
+
+     # Override output_dir with the user-provided value
+     inference_config.output_dir = output_dir
+
+     if model_config.arch in POSE_BASED_MODELS:
+         inference_config.use_pose_model = True
+     else:
+         inference_config.use_pose_model = False
+
+     predictions, results = run_inference(model_config, inference_config, input_frames)
+
+     resp = {}
+     out_dir = Path(inference_config.output_dir)
+     if predictions is None:
+         predictions = []
+
+     if output_option in ["predictions", "all"]:
+         resp["predictions"] = predictions
+
+     if output_option in ["csv", "all"]:
+         csv_path = str(out_dir / "results.csv")
+         resp["csv_path"] = csv_path if Path(csv_path).exists() else None
+
+     if output_option in ["video", "all"]:
+         video_path = str(out_dir / "output.mp4")
+         resp["video_path"] = video_path if Path(video_path).exists() else None
+
+     return resp
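One caveat in `run_inference` above is the hard-coded ~33 ms per frame when mapping frame indices to timestamps. A small sketch (not part of this commit, plain OpenCV only) of deriving per-frame timestamps from the uploaded video's actual frame rate instead:

```python
import cv2

cap = cv2.VideoCapture("temp_input.mp4")      # the temp file written by the endpoint above
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0       # fall back to 30 FPS if metadata is missing
frame_interval_ms = 1000.0 / fps

input_frames, timestamps_ms = [], []
frame_idx = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break
    input_frames.append(frame)
    timestamps_ms.append(frame_idx * frame_interval_ms)  # millisecond offset of this frame
    frame_idx += 1
cap.release()
```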
models/dsta_slr_joint_motion_v3_0.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ecfcb2b459fd68bfe838569d41bdb502f7cd21ddd675790146034cf0e6f71632
+ size 29678372
models/sl_gcn_joint_v3_0.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ab4e3b86ec2a828c9e8f72f1f80ca131c0b7439539412fe15244dbcb64fb2a1
+ size 17046336
models/spoter_v3.0.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38c21cd96446475cdc110f7748b11ad58b84cd055133379684f9f463dea8fcbd
+ size 24208453
request.py ADDED
@@ -0,0 +1,15 @@
+ import requests
+
+ url = 'https://<your-hf-space-url>.hf.space/inference'  # the actual URL after deploying to HF
+ video_path = '/path/to/your_video.mp4'
+ params = {
+     'model_name': 'spoter',
+     'output_option': 'all',
+     'output_dir': 'custom_output_folder'  # users can choose the output folder
+ }
+ files = {
+     'file': open(video_path, 'rb')
+ }
+
+ response = requests.post(url=url, files=files, params=params)
+ print(response.json())
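A sketch of consuming the response from the snippet above; the keys mirror what `app.py` puts into `resp`, and the paths refer to locations inside the Space's container.

```python
data = response.json()

for pred in data.get("predictions") or []:
    print(pred)  # one entry per returned prediction, as produced by the pipeline

if data.get("csv_path"):
    print("Per-sample results CSV (server-side):", data["csv_path"])
if data.get("video_path"):
    print("Annotated output video (server-side):", data["video_path"])
```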
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ transformers
+ pandas
+ evaluate
+ simple-parsing
+ torch
+ torchvision
+ hf-transfer
+ decord
+ accelerate
+ scikit-learn
+ wandb
+ pose-format
+ torchsummary
+ mediapipe
+ opencv-python
+ onnxruntime
+ onnx
+ imageio
+ tk
+ timm
+ einops
+ fastapi
+ uvicorn
src/configs/__init__.py ADDED
@@ -0,0 +1 @@
+ from .arguments import *
src/configs/arguments.py ADDED
@@ -0,0 +1,174 @@
+ from pathlib import Path
+ from typing import Any
+ from dataclasses import dataclass, field
+ from utils import MODELS, VIDEO_EXTENSIONS
+
+
+ @dataclass
+ class TransformConfig:
+     # RGB specific
+     horizontal_flip_prob: float = 0.5
+     aug_type: str = "augmix"
+     aug_paras: dict = field(
+         default_factory=lambda: {
+             "magnitude": 3,
+             "alpha": 1.0,
+             "width": 5,
+             "depth": -1,
+         }
+     )
+     sample_rate: int = 4
+
+     # Pose specific
+     normalization: bool = True
+
+     # SL-GCN, DSTA-SLR specific
+     random_choose: bool = False
+     random_shift: bool = False
+     random_move: bool = False
+     random_mirror: bool = False
+     random_mirror_p: float = 0.5
+     bone_stream: bool = False
+     motion_stream: bool = False
+
+     # SPOTER specific
+     augmentation: bool = True
+     aug_prob: float = 0.5
+     noise: bool = True
+
+     def __post_init__(self):
+         assert self.aug_type in ["augmix", "mixup"], \
+             "Only AugMix and MixUp are supported for now"
+
+
+ @dataclass
+ class DataConfig:
+     dataset: str = "vsl"
+     modality: str = "rgb"
+     subset: str = None
+     data_dir: str = "data/processed/vsl"
+     transform: Any = None
+     fps: int = 30
+     debug: bool = False
+     # transform: TransformConfig = TransformConfig()
+     transform: TransformConfig = field(default_factory=TransformConfig)
+
+
+     def __post_init__(self):
+         assert self.dataset in ["vsl_98", "vsl_400"], \
+             "Only VSL dataset is supported for now"
+         assert self.modality in ["rgb", "pose"], \
+             "Only RGB and Pose modalities are supported for now"
+
+
+ @dataclass
+ class ModelConfig:
+     arch: str = "sl_gcn"
+     pretrained: str = "vsltranslation/sl_gcn_joint_v3_0"
+     num_frozen_layers: int = 0
+     ignored_weights: list = field(default_factory=lambda: [])
+     num_frames: int = 16
+
+     # SL-GCN specific
+     num_points: int = 27
+     groups: int = 8
+     block_size: int = 41
+     in_channels: int = 3
+     labeling_mode: str = "spatial"
+     is_vector: bool = False
+
+     # DSTA-SLR specific
+     graph: str = "wlasl"
+     inner_dim: int = 64
+     drop_layers: int = 2
+     depth: int = 4
+     s_num_heads: int = 1
+     window_size: int = 120
+
+     # SPOTER specific
+     hidden_dim: int = 108
+
+     def __post_init__(self):
+         assert self.arch in MODELS, f"Model {self.arch} is not supported"
+
+
+ @dataclass
+ class TrainingConfig:
+     output_dir: str = "experiments"
+     remove_unused_columns: bool = False
+     do_train: bool = True
+     use_cpu: bool = False
+
+     eval_strategy: str = "epoch"
+     logging_strategy: str = "epoch"
+     save_strategy: str = "epoch"
+     logging_steps: int = 1
+     save_steps: int = 1
+     eval_steps: int = 1
+
+     learning_rate: float = 5e-5
+     weight_decay: float = 0
+     adam_beta1: float = 0.9
+     adam_beta2: float = 0.999
+     adam_epsilon: float = 1e-8
+     warmup_ratio: float = 0.1
+
+     num_train_epochs: int = 10
+     per_device_train_batch_size: int = 8
+     per_device_eval_batch_size: int = 8
+     dataloader_num_workers: int = 0
+
+     load_best_model_at_end: bool = True
+     metric_for_best_model: str = "accuracy"
+     resume_from_checkpoint: str = None
+
+     run_name: str = "swin3d"
+     report_to: str = None
+     push_to_hub: bool = False
+     hub_model_id: str = None
+     hub_strategy: str = "checkpoint"
+     hub_private_repo: bool = True
+
+     def __post_init__(self):
+         self.output_dir = Path(self.output_dir)
+         if str(self.output_dir) == "experiments":
+             self.output_dir = self.output_dir / self.run_name
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         if self.hub_model_id is not None:
+             self.push_to_hub = True
+             if len(self.hub_model_id.split("/")) == 1:
+                 self.hub_model_id = f"{self.hub_model_id}/{self.run_name}"
+
+
+ @dataclass
+ class InferenceConfig:
+     source: str = "webcam"
+     output_dir: str = "demo"
+     use_onnx: bool = False
+     device: str = "cpu"
+     cache_dir: str = "models/huggingface"
+
+     visualize: bool = False
+     show_skeleton: bool = False
+
+     visibility: float = 0.5
+     angle_threshold: int = 140
+     min_num_up_frames: int = 10
+     min_num_down_frames: int = 10
+     delay: int = 400
+
+     top_k: int = 3
+     # SL-GCN, DSTA-SLR specific
+     bone_stream: bool = False
+     motion_stream: bool = False
+
+     def __post_init__(self):
+         self.source = Path(self.source)
+         assert any((
+             str(self.source) == "webcam",
+             (self.source.exists() and str(self.source).endswith(VIDEO_EXTENSIONS))
+         )), \
+             f"Only Webcam and Video sources are supported for now (got {self.source})"
+         self.output_dir = Path(self.output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
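A quick sketch of how these dataclasses behave when instantiated; the values are examples, and `__post_init__` runs the asserts and creates the output directory.

```python
from configs import ModelConfig, InferenceConfig

model_config = ModelConfig(arch="sl_gcn")    # must be one of MODELS, otherwise the assert fires
inference_config = InferenceConfig(
    source="webcam",                         # "webcam" or the path of an existing video file
    output_dir="demo/run_1",                 # created automatically in __post_init__
    use_onnx=True,
)
print(type(inference_config.source))         # pathlib.Path, after __post_init__ conversion
print(inference_config.output_dir.exists())  # True
```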
src/data/__init__.py ADDED
@@ -0,0 +1 @@
+ from .utils import *
src/data/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (178 Bytes).
src/data/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (212 Bytes).
src/data/__pycache__/utils.cpython-312.pyc ADDED
Binary file (7.1 kB).
src/data/__pycache__/utils.cpython-39.pyc ADDED
Binary file (3.61 kB).
src/data/utils.py ADDED
@@ -0,0 +1,157 @@
+ import numpy as np
+ from mediapipe.python.solutions import pose
+ from visualization import draw_text_on_image
+
+
+ class Arm:
+     def __init__(
+         self,
+         side: str,
+         visibility: float = 0.5,
+     ) -> None:
+         if side == "left":
+             self.shoulder_idx = pose.PoseLandmark.LEFT_SHOULDER.value
+             self.elbow_idx = pose.PoseLandmark.LEFT_ELBOW.value
+             self.wrist_idx = pose.PoseLandmark.LEFT_WRIST.value
+         elif side == "right":
+             self.shoulder_idx = pose.PoseLandmark.RIGHT_SHOULDER.value
+             self.elbow_idx = pose.PoseLandmark.RIGHT_ELBOW.value
+             self.wrist_idx = pose.PoseLandmark.RIGHT_WRIST.value
+         else:
+             raise ValueError("Side must be either 'left' or 'right'")
+         self.visibility = visibility
+
+         self.is_up = False
+         self.num_up_frames = 0
+         self.num_down_frames = 0
+         self.start_time = 0
+         self.end_time = 0
+         self.shoulder = None
+         self.elbow = None
+         self.wrist = None
+         self.angle = 0
+
+     def reset_state(self) -> None:
+         self.is_up = False
+         self.num_up_frames = 0
+         self.num_down_frames = 0
+         self.start_time = 0
+         self.end_time = 0
+         self.shoulder = None
+         self.elbow = None
+         self.wrist = None
+         self.angle = 0
+
+     def set_pose(self, landmarks) -> bool:
+         if landmarks[self.shoulder_idx].visibility < self.visibility:
+             return False
+         self.shoulder = (
+             landmarks[self.shoulder_idx].x,
+             landmarks[self.shoulder_idx].y,
+         )
+
+         if landmarks[self.elbow_idx].visibility < self.visibility:
+             return False
+         self.elbow = (
+             landmarks[self.elbow_idx].x,
+             landmarks[self.elbow_idx].y,
+         )
+
+         if landmarks[self.wrist_idx].visibility < self.visibility:
+             return False
+         self.wrist = (
+             landmarks[self.wrist_idx].x,
+             landmarks[self.wrist_idx].y,
+         )
+
+         self.angle = calculate_angle(self.shoulder, self.elbow, self.wrist)
+         return True
+
+     def visualize(
+         self,
+         frame: np.ndarray,
+         position: tuple = (20, 50),
+         prefix: str = "Angle",
+         color: tuple = (0, 0, 255),
+     ) -> np.ndarray:
+         text = prefix + ": " + str(round(self.angle, 2))
+         return draw_text_on_image(
+             image=frame,
+             text=text,
+             position=position,
+             color=color,
+             font_size=20,
+         )
+
+
+ def get_sample_timestamp(left_arm: Arm, right_arm: Arm) -> tuple:
+     start_time, end_time = 0, 0
+     left_arm_available = left_arm.start_time > 0 and left_arm.end_time > 0
+     right_arm_available = right_arm.start_time > 0 and right_arm.end_time > 0
+
+     if left_arm_available and right_arm.start_time == 0:
+         start_time = left_arm.start_time
+         end_time = left_arm.end_time
+     if right_arm_available and left_arm.start_time == 0:
+         start_time = right_arm.start_time
+         end_time = right_arm.end_time
+     if all((
+         left_arm_available, not left_arm.is_up,
+         right_arm_available, not right_arm.is_up,
+     )):
+         start_time = min(left_arm.start_time, right_arm.start_time)
+         end_time = max(left_arm.end_time, right_arm.end_time)
+
+     # Convert milliseconds to seconds
+     start_time /= 1000
+     end_time /= 1000
+     return start_time, end_time
+
+
+ def calculate_angle(a: tuple, b: tuple, c: tuple) -> float:
+     a = np.array(a)  # First
+     b = np.array(b)  # Mid
+     c = np.array(c)  # End
+
+     radians = np.arctan2(c[1] - b[1], c[0] - b[0]) - np.arctan2(a[1] - b[1], a[0] - b[0])
+     angle = np.abs(radians * 180.0 / np.pi)
+
+     return 360 - angle if angle > 180 else angle
+
+
+ def ok_to_get_frame(
+     arm: Arm,
+     angle_threshold: int,
+     min_num_up_frames: int,
+     min_num_down_frames: int,
+     current_time: int,
+     delay: int,
+ ) -> bool:
+     if 0 < arm.angle < angle_threshold:
+         if arm.is_up:
+             arm.num_down_frames = 0
+             arm.end_time = 0
+         else:
+             if arm.num_up_frames == min_num_up_frames:
+                 arm.is_up = True
+                 arm.num_up_frames = 0
+             else:
+                 if arm.num_up_frames == 0:
+                     arm.start_time = current_time - delay
+                 arm.num_up_frames += 1
+                 return False
+     else:
+         if arm.is_up:
+             if arm.num_down_frames == min_num_down_frames:
+                 arm.is_up = False
+                 arm.num_down_frames = 0
+             else:
+                 if arm.num_down_frames == 0:
+                     arm.end_time = current_time + delay
+                 arm.num_down_frames += 1
+                 return True
+         else:
+             arm.num_up_frames = 0
+             arm.start_time = 0
+
+     return arm.is_up
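A worked example of the angle computation above, assuming it is run from `src/` like the rest of the code; the coordinates are made-up values in MediaPipe's normalized [0, 1] space.

```python
from data.utils import calculate_angle

# hypothetical landmark positions (x, y) for one arm
shoulder = (0.50, 0.30)
elbow = (0.55, 0.50)
wrist = (0.80, 0.55)

angle = calculate_angle(shoulder, elbow, wrist)
print(f"elbow angle: {angle:.1f} degrees")  # ~115 degrees, below the default 140-degree threshold,
                                            # so ok_to_get_frame would treat this arm as raised
```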
src/inference.py ADDED
@@ -0,0 +1,271 @@
+ import shutil
+ import logging
+ from time import time
+
+ import numpy as np
+ import pandas as pd
+ import cv2
+ from traceback import format_exc
+ from argparse import Namespace
+ from transformers import Pipeline
+ from simple_parsing import ArgumentParser
+ import mediapipe as mp
+ from mediapipe.python.solutions.pose import PoseLandmark
+ from mediapipe.python.solutions.hands import HandLandmark
+ from mediapipe.python.solutions.drawing_utils import DrawingSpec
+
+ from visualization import draw_text_on_image
+ from configs import ModelConfig, InferenceConfig
+ from utils import config_logger, POSE_BASED_MODELS
+ from data import Arm, get_sample_timestamp, ok_to_get_frame
+ from tools import load_pipeline, Predictions
+
+
+ SPOTER_POSE_LANDMARKS = [
+     PoseLandmark.NOSE,
+     PoseLandmark.LEFT_EYE,
+     PoseLandmark.RIGHT_EYE,
+     PoseLandmark.RIGHT_SHOULDER,
+     PoseLandmark.LEFT_SHOULDER,
+     PoseLandmark.RIGHT_ELBOW,
+     PoseLandmark.LEFT_ELBOW,
+     PoseLandmark.RIGHT_WRIST,
+     PoseLandmark.LEFT_WRIST,
+ ]
+
+ SPOTER_HAND_LANDMARKS = [
+     HandLandmark.WRIST,
+     HandLandmark.INDEX_FINGER_TIP, HandLandmark.INDEX_FINGER_DIP, HandLandmark.INDEX_FINGER_PIP, HandLandmark.INDEX_FINGER_MCP,
+     HandLandmark.MIDDLE_FINGER_TIP, HandLandmark.MIDDLE_FINGER_DIP, HandLandmark.MIDDLE_FINGER_PIP, HandLandmark.MIDDLE_FINGER_MCP,
+     HandLandmark.RING_FINGER_TIP, HandLandmark.RING_FINGER_DIP, HandLandmark.RING_FINGER_PIP, HandLandmark.RING_FINGER_MCP,
+     HandLandmark.PINKY_TIP, HandLandmark.PINKY_DIP, HandLandmark.PINKY_PIP, HandLandmark.PINKY_MCP,
+     HandLandmark.THUMB_TIP, HandLandmark.THUMB_IP, HandLandmark.THUMB_MCP, HandLandmark.THUMB_CMC,
+ ]
+
+ def get_args() -> Namespace:
+     parser = ArgumentParser(
+         description="Run inference on VSL",
+         add_config_path_arg=True,
+     )
+     parser.add_arguments(ModelConfig, "model")
+     parser.add_arguments(InferenceConfig, "inference")
+     return parser.parse_args()
+
+
+ def inference(model_config, inference_config: InferenceConfig, pipeline: Pipeline) -> None:
+     # Load video
+     source = str(inference_config.source) if inference_config.source.is_file() else 0
+     cap = cv2.VideoCapture(source)
+     if inference_config.output_dir is not None:
+         writer = cv2.VideoWriter(
+             str(inference_config.output_dir / "output.mp4"),
+             cv2.VideoWriter_fourcc(*"mp4v"),
+             cap.get(cv2.CAP_PROP_FPS),
+             (int(cap.get(3)), int(cap.get(4))),
+         )
+
+     # Init Mediapipe
+     mp_holistic = mp.solutions.holistic
+     mp_drawing = mp.solutions.drawing_utils
+     mp_drawing_styles = mp.solutions.drawing_styles
+
+     custom_pose_style = mp_drawing_styles.get_default_pose_landmarks_style()
+     custom_right_hand_style = mp_drawing_styles.get_default_hand_landmarks_style()
+     custom_left_hand_style = mp_drawing_styles.get_default_hand_landmarks_style()
+     custom_pose_connections = list(mp_holistic.POSE_CONNECTIONS)
+     custom_hand_connections = list(mp_holistic.HAND_CONNECTIONS)
+
+     if inference_config.show_skeleton:
+         # if model_config.arch == 'spoter':
+         pose_landmarks = SPOTER_POSE_LANDMARKS
+         hand_landmarks = SPOTER_HAND_LANDMARKS
+
+         for landmark in PoseLandmark:
+             if landmark in pose_landmarks:
+                 custom_pose_style[landmark] = DrawingSpec(color=(0,255,0), thickness=2, circle_radius=2)
+             else:
+                 custom_pose_style[landmark] = DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                 for connection_tuple in list(custom_pose_connections):
+                     if landmark.value in connection_tuple:
+                         custom_pose_connections.remove(connection_tuple)
+
+         for landmark in HandLandmark:
+             if landmark in hand_landmarks:
+                 custom_right_hand_style[landmark] = DrawingSpec(color=(0,0,255), thickness=2, circle_radius=2)
+                 custom_left_hand_style[landmark] = DrawingSpec(color=(255,0,0), thickness=2, circle_radius=2)
+             else:
+                 custom_right_hand_style[HandLandmark[landmark.name]] = DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                 custom_left_hand_style[HandLandmark[landmark.name]] = DrawingSpec(color=(0,0,0), thickness=0, circle_radius=0)
+                 for connection_tuple in list(custom_hand_connections):
+                     if landmark.value in connection_tuple:
+                         custom_hand_connections.remove(connection_tuple)
+
+     # Init variables
+     right_arm = Arm("right", inference_config.visibility)
+     left_arm = Arm("left", inference_config.visibility)
+     data = []
+     results = None
+     predictions = Predictions()
+
+     with mp_holistic.Holistic(min_detection_confidence=0.9, min_tracking_confidence=0.5) as holistic:
+         while cap.isOpened():
+             success, frame = cap.read()
+             if not success:
+                 break
+
+             # Recolor image to RGB, because mp processes on RGB image
+             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             frame.flags.writeable = False
+
+             # Make detections
+             detection_results = holistic.process(frame)
+
+             # Recolor image back to BGR, because cv2 processes on BGR image
+             frame.flags.writeable = True
+             frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+
+             # Extract landmarks
+             try:
+                 landmarks = detection_results.pose_landmarks.landmark
+             except Exception:
+                 continue
+
+             left_arm.set_pose(landmarks)
+             right_arm.set_pose(landmarks)
+
+             # Check if arms are up or down
+             left_arm_ok_to_get_frame = ok_to_get_frame(
+                 arm=left_arm,
+                 angle_threshold=inference_config.angle_threshold,
+                 min_num_up_frames=inference_config.min_num_up_frames,
+                 min_num_down_frames=inference_config.min_num_down_frames,
+                 current_time=cap.get(cv2.CAP_PROP_POS_MSEC),
+                 delay=inference_config.delay,
+             )
+             right_arm_ok_to_get_frame = ok_to_get_frame(
+                 arm=right_arm,
+                 angle_threshold=inference_config.angle_threshold,
+                 min_num_up_frames=inference_config.min_num_up_frames,
+                 min_num_down_frames=inference_config.min_num_down_frames,
+                 current_time=cap.get(cv2.CAP_PROP_POS_MSEC),
+                 delay=inference_config.delay,
+             )
+             if left_arm_ok_to_get_frame or right_arm_ok_to_get_frame:
+                 # logging.info("Frame added to the list")
+                 predictions = Predictions()
+                 data.append(detection_results if inference_config.use_pose_model else frame)
+
+             # Calculate the start and end time of the sign
+             start_time, end_time = get_sample_timestamp(left_arm, right_arm)
+
+             # Convert from milliseconds to seconds
+             start_time /= 1_000
+             end_time /= 1_000
+
+             # logging.info(f"start_time: {start_time} - end_time: {end_time}")
+             # logging.info(f"\tLeft arm: {left_arm.start_time} - {left_arm.end_time} - {left_arm.is_up}")
+             # logging.info(f"\tRight arm: {right_arm.start_time} - {right_arm.end_time} - {right_arm.is_up}")
+
+             if start_time != 0 and end_time != 0:
+                 # Render waiting screen
+                 if inference_config.visualize:
+                     wait_frame = draw_text_on_image(
+                         np.zeros_like(frame),
+                         text="Please wait for the prediction...",
+                         position=(20, 20),
+                         color=(255, 255, 255),
+                         font_size=20,
+                     )
+                     cv2.imshow("Video Visualization", wait_frame)
+                     if cv2.waitKey(1) & 0xFF == ord('q'):
+                         break
+
+                 start_inference_time = time()
+                 predictions = Predictions(predictions=pipeline(np.array(data)))
+                 predictions.inference_time = time() - start_inference_time
+
+                 predictions.start_time = start_time
+                 predictions.end_time = end_time
+                 logging.info(str(predictions))
+                 results = predictions.merge_results(results)
+
+                 # Reset variables
+                 start_time = 0
+                 end_time = 0
+                 left_arm.reset_state()
+                 right_arm.reset_state()
+                 data = []
+
+             # Render detections
+             frame = left_arm.visualize(frame, (20, 10), "Left arm angle")
+             frame = right_arm.visualize(frame, (20, 40), "Right arm angle")
+             frame = predictions.visualize(frame, (20, 70))
+             if inference_config.show_skeleton:
+                 mp_drawing.draw_landmarks(
+                     frame,
+                     detection_results.pose_landmarks,
+                     connections=custom_pose_connections,  # passing the modified connections list
+                     landmark_drawing_spec=custom_pose_style)  # and drawing style
+
+                 mp_drawing.draw_landmarks(
+                     frame,
+                     detection_results.right_hand_landmarks,
+                     connections=custom_hand_connections,  # passing the modified connections list
+                     landmark_drawing_spec=custom_right_hand_style)  # and drawing style
+
+                 mp_drawing.draw_landmarks(
+                     frame,
+                     detection_results.left_hand_landmarks,
+                     connections=custom_hand_connections,  # passing the modified connections list
+                     landmark_drawing_spec=custom_left_hand_style)  # and drawing style
+
+             if inference_config.output_dir is not None:
+                 writer.write(frame)
+
+             if inference_config.visualize:
+                 cv2.imshow("Video Visualization", frame)
+                 if cv2.waitKey(1) & 0xFF == ord('q'):
+                     break
+
+     cap.release()
+     cv2.destroyAllWindows()
+
+     if inference_config.output_dir is not None:
+         writer.release()
+         logging.info(f"Video is recorded and saved to {inference_config.output_dir / 'output.mp4'}")
+         pd.DataFrame(results).to_csv(inference_config.output_dir / "results.csv", index=False)
+         logging.info(f"Results saved to {inference_config.output_dir / 'results.csv'}")
+
+
+ def main(args: Namespace) -> None:
+     model_config = args.model
+     logging.info(model_config)
+     inference_config = args.inference
+     logging.info(inference_config)
+
+     if model_config.arch in POSE_BASED_MODELS:
+         inference_config.use_pose_model = True
+     else:
+         inference_config.use_pose_model = False
+
+     pipeline = load_pipeline(model_config, inference_config)
+     logging.info("Pipeline loaded")
+
+     inference(model_config, inference_config, pipeline)
+     logging.info("Inference completed")
+
+
+ if __name__ == "__main__":
+     try:
+         args = get_args()
+
+         config_logger(args.inference.output_dir / "inference.log")
+         logging.info(f"Config file loaded from {args.config_path[0]}")
+
+         shutil.copy(args.config_path[0], args.inference.output_dir / "inference.yaml")
+         logging.info(f"Config file saved to {args.inference.output_dir}")
+
+         main(args=args)
+     except Exception:
+         print(format_exc())
src/main.py ADDED
@@ -0,0 +1,51 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ from fastapi.responses import JSONResponse
+ from pathlib import Path
+ import shutil
+ import logging
+ from inference import inference, get_args
+ from utils import config_logger
+ from tools import load_pipeline
+ from configs import ModelConfig, InferenceConfig
+
+ app = FastAPI()
+
+ @app.post("/upload-video/")
+ async def upload_video(file: UploadFile = File(...)):
+     if not file.filename.endswith(('.mp4', '.avi', '.mov', '.mkv')):
+         raise HTTPException(status_code=400, detail="Invalid file type. Only video files are allowed.")
+
+     # Save the uploaded file to a temporary location
+     temp_file_path = Path(f"temp_{file.filename}")
+     with temp_file_path.open("wb") as buffer:
+         shutil.copyfileobj(file.file, buffer)
+
+     # Load configurations
+     args = get_args()
+     model_config = args.model
+     inference_config = args.inference
+
+     # Update the source to the uploaded file
+     inference_config.source = temp_file_path
+
+     # Configure logger
+     config_logger(inference_config.output_dir / "inference.log")
+
+     # Load the pipeline
+     pipeline = load_pipeline(model_config, inference_config)
+
+     # Run inference
+     try:
+         inference(model_config, inference_config, pipeline)
+     except Exception as e:
+         logging.error(f"Error during inference: {str(e)}")
+         raise HTTPException(status_code=500, detail="Error during video processing")
+
+     # Clean up the temporary file
+     temp_file_path.unlink()
+
+     return JSONResponse(content={"message": "Video processed successfully"})
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
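A sketch of calling the `/upload-video/` endpoint above from a client, assuming the service is running locally on port 8000 as in the `__main__` block; the video path is a placeholder.

```python
import requests

with open("/path/to/your_video.mp4", "rb") as f:
    resp = requests.post("http://localhost:8000/upload-video/", files={"file": f})

print(resp.status_code, resp.json())  # {"message": "Video processed successfully"} on success
```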
src/tools/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .models import *
+ from .features import *
+ # from .utils import exists_on_hf
src/tools/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (203 Bytes).
src/tools/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (234 Bytes).
src/tools/__pycache__/features.cpython-39.pyc ADDED
Binary file (1.51 kB).
src/tools/__pycache__/models.cpython-312.pyc ADDED
Binary file (15.4 kB).
src/tools/__pycache__/models.cpython-39.pyc ADDED
Binary file (9.63 kB).
src/tools/features.py ADDED
@@ -0,0 +1,29 @@
+ import torch
+ from configs import DataConfig
+ from features import BaseDataset, VSL98Dataset, VSL400Dataset
+
+
+ def load_dataset(data_config: DataConfig) -> BaseDataset:
+     '''
+     Load the dataset class that matches `data_config.dataset`.
+     '''
+     datasets = {
+         "vsl_98": VSL98Dataset,
+         "vsl_400": VSL400Dataset,
+     }
+     return datasets[data_config.dataset](data_config)
+
+
+ def rgb_collate_fn(examples) -> dict:
+     # permute to (num_frames, num_channels, height, width)
+     pixel_values = torch.stack(
+         [example["video"].permute(1, 0, 2, 3) for example in examples]
+     )
+     labels = torch.tensor([example["label"] for example in examples])
+     return {"pixel_values": pixel_values, "labels": labels}
+
+
+ def pose_collate_fn(examples) -> dict:
+     # stack pose tensors and labels into a batch
+     poses = torch.stack([example["pose"] for example in examples])
+     labels = torch.tensor([example["label"] for example in examples])
+     return {"poses": poses, "labels": labels}
src/tools/models.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import logging
3
+ import onnxruntime as ort
4
+ from time import time
5
+ from typing import Union
6
+ from configs import ModelConfig, InferenceConfig
7
+ from utils import (
8
+ POSE_BASED_MODELS,
9
+ RGB_BASED_MODELS,
10
+ HUGGINGFACE_RGB_BASED_MODELS,
11
+ TORCHHUB_RGB_BASED_MODELS,
12
+ )
13
+ from transformers import (
14
+ ImageProcessingMixin,
15
+ FeatureExtractionMixin,
16
+ AutoModelForVideoClassification,
17
+ AutoModel,
18
+ Pipeline,
19
+ pipeline,
20
+ )
21
+ from transformers.pipelines import PIPELINE_REGISTRY
22
+ from visualization import draw_text_on_image
23
+ from utils import exists_on_hf
24
+ from models import (
25
+ Swin3DConfig, Swin3DImageProcessor, Swin3DForVideoClassification,
26
+ S3DConfig, S3DImageProcessor, S3DForVideoClassification,
27
+ VideoResNetConfig, VideoResNetImageProcessor, VideoResNetForVideoClassification,
28
+ MViTConfig, MViTImageProcessor, MViTForVideoClassification,
29
+ SLGCNConfig, SLGCNFeatureExtractor, SLGCNForGraphClassification,
30
+ SPOTERConfig, SPOTERFeatureExtractor, SPOTERForGraphClassification,
31
+ DSTASLRConfig, DSTASLRFeatureExtractor, DSTASLRForGraphClassification,
32
+ VideoMAEConfig, VideoMAEImageProcessor, VideoMAEForVideoClassification
33
+ )
34
+ from pipelines import (
35
+ VideoClassificationPipeline,
36
+ SLGCNGraphClassificationPipeline,
37
+ SPOTERGraphClassificationPipeline,
38
+ )
39
+
40
+
41
+ def load_model(
42
+ model_config: ModelConfig,
43
+ label2id: dict = None,
44
+ id2label: dict = None,
45
+ do_train: bool = False,
46
+ ) -> tuple:
47
+ '''
48
+ '''
49
+ if do_train:
50
+ if model_config.arch in POSE_BASED_MODELS:
51
+ return load_pose_model_for_training(model_config, label2id, id2label)
52
+ return load_rgb_model_for_training(model_config, label2id, id2label)
53
+
54
+ if model_config.arch in POSE_BASED_MODELS:
55
+ processor = FeatureExtractionMixin.from_pretrained(
56
+ model_config.pretrained,
57
+ trust_remote_code=True,
58
+ cache_dir="models/huggingface",
59
+ )
60
+ model = AutoModel.from_pretrained(
61
+ model_config.pretrained,
62
+ trust_remote_code=True,
63
+ cache_dir="models/huggingface",
64
+ )
65
+ else:
66
+ processor = ImageProcessingMixin.from_pretrained(
67
+ model_config.pretrained,
68
+ trust_remote_code=True,
69
+ cache_dir="models/huggingface",
70
+ )
71
+ model = AutoModelForVideoClassification.from_pretrained(
72
+ model_config.pretrained,
73
+ trust_remote_code=True,
74
+ cache_dir="models/huggingface",
75
+ )
76
+ model.eval()
77
+ return model.config, processor, model
78
+
79
+
80
+ def load_rgb_model_for_training(
81
+ model_config: ModelConfig,
82
+ label2id: dict = None,
83
+ id2label: dict = None,
84
+ ) -> tuple:
85
+ '''
86
+ '''
87
+ if model_config.arch in HUGGINGFACE_RGB_BASED_MODELS:
88
+ if model_config.arch == "videomae":
89
+ config_class = VideoMAEConfig
90
+ processor_class = VideoMAEImageProcessor
91
+ model_class = VideoMAEForVideoClassification
92
+ elif exists_on_hf(model_config.pretrained):
93
+ processor = ImageProcessingMixin.from_pretrained(
94
+ model_config.pretrained,
95
+ trust_remote_code=True,
96
+ cache_dir="models/huggingface",
97
+ )
98
+ model = AutoModelForVideoClassification.from_pretrained(
99
+ model_config.pretrained,
100
+ label2id,
101
+ id2label,
102
+ ignore_mismatched_sizes=True,
103
+ trust_remote_code=True,
104
+ cache_dir="models/huggingface",
105
+ )
106
+ return model.config, processor, model
107
+ elif model_config.arch in TORCHHUB_RGB_BASED_MODELS:
108
+ if model_config.arch in ['swin3d_t', 'swin3d_s', 'swin3d_b']:
109
+ config_class = Swin3DConfig
110
+ processor_class = Swin3DImageProcessor
111
+ model_class = Swin3DForVideoClassification
112
+ elif model_config.arch in ['r3d_18', 'mc3_18', 'r2plus1d_18']:
113
+ config_class = VideoResNetConfig
114
+ processor_class = VideoResNetImageProcessor
115
+ model_class = VideoResNetForVideoClassification
116
+ elif model_config.arch in ['s3d']:
117
+ config_class = S3DConfig
118
+ processor_class = S3DImageProcessor
119
+ model_class = S3DForVideoClassification
120
+ elif model_config.arch in ['mvit_v1_b', 'mvit_v2_s']:
121
+ config_class = MViTConfig
122
+ processor_class = MViTImageProcessor
123
+ model_class = MViTForVideoClassification
124
+ else:
125
+ logging.error(f"Model {model_config.arch} is not supported")
126
+ exit(1)
127
+
128
+ config_class.register_for_auto_class()
129
+ processor_class.register_for_auto_class("AutoImageProcessor")
130
+ model_class.register_for_auto_class("AutoModel")
131
+ model_class.register_for_auto_class("AutoModelForVideoClassification")
132
+ logging.info(f"{model_config.arch} classes registered")
133
+
134
+ config = config_class(**vars(model_config))
135
+ processor = processor_class(config=config)
136
+ model = model_class(config=config, label2id=label2id, id2label=id2label)
137
+
138
+ return config, processor, model
139
+
140
+
141
+ def load_pose_model_for_training(
142
+ model_config: ModelConfig,
143
+ label2id: dict = None,
144
+ id2label: dict = None,
145
+ ) -> tuple:
146
+ '''
147
+ '''
148
+ if exists_on_hf(model_config.pretrained):
149
+ processor = FeatureExtractionMixin.from_pretrained(
150
+ model_config.pretrained,
151
+ trust_remote_code=True,
152
+ cache_dir="models/huggingface",
153
+ )
154
+ model = AutoModel.from_pretrained(
155
+ model_config.pretrained,
156
+ label2id=label2id,
157
+ id2label=id2label,
158
+ ignore_mismatched_sizes=True,
159
+ trust_remote_code=True,
160
+ cache_dir="models/huggingface",
161
+ )
162
+ return model.config, processor, model
163
+ elif model_config.arch in POSE_BASED_MODELS:
164
+ if model_config.arch == "spoter":
165
+ config_class = SPOTERConfig
166
+ processor_class = SPOTERFeatureExtractor
167
+ model_class = SPOTERForGraphClassification
168
+ elif model_config.arch == "sl_gcn":
169
+ config_class = SLGCNConfig
170
+ processor_class = SLGCNFeatureExtractor
171
+ model_class = SLGCNForGraphClassification
172
+ elif model_config.arch == "dsta_slr":
173
+ config_class = DSTASLRConfig
174
+ processor_class = DSTASLRFeatureExtractor
175
+ model_class = DSTASLRForGraphClassification
176
+ else:
177
+ logging.error(f"Model {model_config.arch} is not supported")
178
+ exit(1)
179
+
180
+ config_class.register_for_auto_class()
181
+ processor_class.register_for_auto_class("AutoFeatureExtractor")
182
+ model_class.register_for_auto_class("AutoModel")
183
+ logging.info(F"Registering {model_config.arch} classes")
184
+
185
+ config = config_class(**vars(model_config))
186
+ processor = processor_class(config=config)
187
+ model = model_class(config=config, label2id=label2id, id2label=id2label)
188
+
189
+ return config, processor, model
190
+
191
+
192
+ class Predictions:
193
+ def __init__(
194
+ self,
195
+ predictions: list[dict] = None,
196
+ inference_time: float = 0,
197
+ start_time: float = 0,
198
+ end_time: float = 0,
199
+ ) -> None:
200
+ self.predictions = predictions
201
+ self.inference_time = inference_time
202
+ self.start_time = start_time
203
+ self.end_time = end_time
204
+
205
+ def visualize(
206
+ self,
207
+ frame: torch.Tensor,
208
+ position: tuple = (20, 100),
209
+ prefix: str = "Predictions",
210
+ color: tuple = (0, 0, 255),
211
+ ) -> None:
212
+ text = prefix + ": " + self.get_pred_message()
213
+ return draw_text_on_image(
214
+ image=frame,
215
+ text=text,
216
+ position=position,
217
+ color=color,
218
+ font_size=20,
219
+ )
220
+
221
+ def get_pred_message(self) -> str:
222
+ if not any((
223
+ self.start_time,
224
+ self.end_time,
225
+ self.inference_time,
226
+ self.predictions
227
+ )):
228
+ return ""
229
+
230
+ return ', '.join(
231
+ [
232
+ f"{pred['gloss']} ({pred['score']*100:.2f}%)"
233
+ for pred in self.predictions
234
+ ]
235
+ )
236
+
237
+ def __str__(self) -> str:
238
+ if not any((
239
+ self.start_time,
240
+ self.end_time,
241
+ self.inference_time,
242
+ self.predictions
243
+ )):
244
+ return ""
245
+
246
+ predictions = self.get_pred_message()
247
+ message = "Sample start: {:.2f}s - end: {:.2f}s | Runtime: {:.2f}s | Predictions: {}"
248
+ return message.format(self.start_time, self.end_time, self.inference_time, predictions)
249
+
250
+ def merge_results(self, results: dict = None) -> dict:
251
+ if results is None:
252
+ results = {
253
+ "start_time": [],
254
+ "end_time": [],
255
+ "inference_time": [],
256
+ "prediction": [],
257
+ }
258
+ results["start_time"].append(self.start_time)
259
+ results["end_time"].append(self.end_time)
260
+ results["inference_time"].append(self.inference_time)
261
+ results["prediction"].append(self.predictions)
262
+ return results
263
+
264
+
265
+ def get_predictions(
266
+ inputs: torch.Tensor,
267
+ model: Union[ort.InferenceSession, AutoModel],
268
+ id2gloss: dict,
269
+ k: int = 3,
270
+ ) -> Predictions:
271
+ '''
272
+ Get the top-k predictions.
273
+ Parameters
274
+ ----------
275
+ inputs : torch.Tensor
276
+ Model inputs (Time, Height, Width, Channels).
277
+ model : Union[ort.InferenceSession, AutoModel]
278
+ Model to get predictions from.
279
+ id2gloss : dict
280
+ Mapping of class indices to glosses.
281
+ k : int, optional
282
+ Number of predictions to return, by default 3.
283
+ Returns
284
+ -------
285
+ tuple
286
+ List of top-k predictions and inference time.
287
+ '''
288
+ if inputs is None:
289
+ return Predictions()
290
+
291
+ # Get logits
292
+ start_time = time()
293
+ if isinstance(model, ort.InferenceSession):
294
+ inputs = inputs.cpu().numpy()
295
+ logits = torch.from_numpy(model.run(None, {"pixel_values": inputs})[0])
296
+ else:
297
+ logits = model(inputs.to(model.device)).logits
298
+ inference_time = time() - start_time
299
+
300
+ # Get top-3 predictions
301
+ topk_scores, topk_indices = torch.topk(logits, k, dim=1)
302
+ topk_scores = torch.nn.functional.softmax(topk_scores, dim=1).squeeze().detach().numpy()
303
+ topk_indices = topk_indices.squeeze().detach().numpy()
304
+ predictions = [
305
+ {
306
+ 'gloss': id2gloss[str(topk_indices[i])],
307
+ 'score': topk_scores[i],
308
+ }
309
+ for i in range(k)
310
+ ]
311
+
312
+ return Predictions(predictions=predictions, inference_time=inference_time)
313
+
314
+
315
+ def register_pipeline(model_config: ModelConfig) -> Pipeline:
316
+ '''
317
+ '''
318
+ _, processor, model = load_model(model_config)
319
+
320
+ if model_config.arch == "spoter":
321
+ PIPELINE_REGISTRY.register_pipeline(
322
+ "video-classification",
323
+ pipeline_class=SPOTERGraphClassificationPipeline,
324
+ pt_model=AutoModel,
325
+ default={"pt": ("vsltranslation/spoter_v3.0", "main")},
326
+ type="multimodal",
327
+ )
328
+ return SPOTERGraphClassificationPipeline(
329
+ model=model,
330
+ feature_extractor=processor,
331
+ )
332
+ elif model_config.arch in ["sl_gcn", "dsta_slr"]:
333
+ PIPELINE_REGISTRY.register_pipeline(
334
+ "video-classification",
335
+ pipeline_class=SLGCNGraphClassificationPipeline,
336
+ pt_model=AutoModel,
337
+ default={"pt": ("vsltranslation/sl_gcn_joint_v1.0", "main")},
338
+ type="multimodal",
339
+ )
340
+ return SLGCNGraphClassificationPipeline(
341
+ model=model,
342
+ feature_extractor=processor,
343
+ )
344
+
345
+ PIPELINE_REGISTRY.register_pipeline(
346
+ "video-classification",
347
+ pipeline_class=VideoClassificationPipeline,
348
+ pt_model=AutoModelForVideoClassification,
349
+ default={"pt": ("vsltranslation/swin3d_t_v1.0", "main")},
350
+ type="multimodal",
351
+ )
352
+ return VideoClassificationPipeline(
353
+ model=model,
354
+ image_processor=processor,
355
+ )
356
+
357
+
+ def load_pipeline(
+     model_config: ModelConfig,
+     inference_config: InferenceConfig,
+ ) -> Pipeline:
+     '''
+     Load a video-classification pipeline for the configured model, passing the
+     pose-specific keyword arguments only for pose-based architectures.
+     '''
+     if model_config.arch in POSE_BASED_MODELS:
+         return pipeline(
+             "video-classification",
+             model=model_config.pretrained,
+             feature_extractor=model_config.pretrained,
+             device=inference_config.device,
+             model_kwargs={
+                 "cache_dir": inference_config.cache_dir,
+             },
+             trust_remote_code=True,
+             use_onnx=inference_config.use_onnx,
+             top_k=inference_config.top_k,
+             bone_stream=inference_config.bone_stream,
+             motion_stream=inference_config.motion_stream,
+         )
+ 
+     return pipeline(
+         "video-classification",
+         model=model_config.pretrained,
+         image_processor=model_config.pretrained,
+         device=inference_config.device,
+         model_kwargs={
+             "cache_dir": inference_config.cache_dir,
+         },
+         trust_remote_code=True,
+         use_onnx=inference_config.use_onnx,
+         top_k=inference_config.top_k,
+     )
+ 
+ 
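A hedged example of driving load_pipeline; the ModelConfig / InferenceConfig field names are inferred from how they are accessed above and may not match the dataclasses in src/configs/arguments.py exactly.

# Sketch: constructor arguments are inferred, not copied from src/configs/arguments.py.
model_config = ModelConfig(arch="sl_gcn", pretrained="vsltranslation/sl_gcn_joint_v1.0")
inference_config = InferenceConfig(
    device="cpu",
    cache_dir=".cache",
    use_onnx=True,
    top_k=3,
    bone_stream=False,
    motion_stream=False,
)
classifier = load_pipeline(model_config, inference_config)
result = classifier("path/to/sign_video.mp4")   # hypothetical input video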
+ def get_input_shape(
+     arch: str,
+     processor: Union[ImageProcessingMixin, FeatureExtractionMixin],
+     batch_size: int = 1,
+ ) -> tuple:
+     '''
+     Get the input shape for the model.
+ 
+     Parameters
+     ----------
+     arch : str
+         Model architecture name.
+     processor : Union[ImageProcessingMixin, FeatureExtractionMixin]
+         Model processor.
+     batch_size : int, optional
+         Batch size, by default 1.
+ 
+     Returns
+     -------
+     tuple
+         Input shape.
+     '''
+     if arch in RGB_BASED_MODELS:
+         return (
+             batch_size,
+             processor.num_frames,
+             3,
+             processor.size["height"],
+             processor.size["width"],
+         )
+     elif arch in POSE_BASED_MODELS:
+         if arch == "spoter":
+             return (
+                 batch_size,
+                 processor.num_frames,
+                 processor.num_points,
+                 processor.in_channels,
+             )
+         elif arch in ["sl_gcn", "dsta_slr"]:
+             return (
+                 batch_size,
+                 processor.in_channels,
+                 processor.window_size,
+                 processor.num_points,
+                 processor.num_people,
+             )
+         else:
+             logging.error(f"Model {arch} is not supported")
+             exit(1)
+     else:
+         logging.error(f"Model {arch} is not supported")
+         exit(1)
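For reference, given a processor returned by load_model, the shapes above work out as follows; the commented dimensions are only indicative, since concrete values come from the processor config.

# Indicative shapes only; real values are read from the loaded processor.
# RGB models (e.g. swin3d_t): (batch, num_frames, 3, height, width)
# spoter:                     (batch, num_frames, num_points, in_channels)
# sl_gcn / dsta_slr:          (batch, in_channels, window_size, num_points, num_people)
dummy = torch.randn(*get_input_shape("dsta_slr", processor))   # e.g. for a warm-up pass or ONNX export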
src/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .loggers import *
+ from .constants import *
src/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (205 Bytes).
src/utils/__pycache__/constants.cpython-312.pyc ADDED
Binary file (4.35 kB).
src/utils/__pycache__/loggers.cpython-312.pyc ADDED
Binary file (1.59 kB).
src/utils/constants.py ADDED
@@ -0,0 +1,158 @@
+ import numpy as np
+ 
+ 
+ VIDEO_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
+ 
+ TORCHHUB_RGB_BASED_MODELS = (
+     'swin3d_t',
+     'swin3d_s',
+     'swin3d_b',
+     "r3d_18",
+     "mc3_18",
+     "r2plus1d_18",
+     "s3d",
+     "mvit_v1_b",
+     "mvit_v2_s",
+ )
+ HUGGINGFACE_RGB_BASED_MODELS = (
+     "videomae",
+ )
+ RGB_BASED_MODELS = HUGGINGFACE_RGB_BASED_MODELS + TORCHHUB_RGB_BASED_MODELS
+ 
+ POSE_BASED_MODELS = (
+     "spoter",
+     "sl_gcn",
+     "dsta_slr"
+ )
+ 
+ MODELS = RGB_BASED_MODELS + POSE_BASED_MODELS
+ 
+ HAND_LANDMARKS = [
+     "wrist",
+     "indexTip",
+     "indexDIP",
+     "indexPIP",
+     "indexMCP",
+     "middleTip",
+     "middleDIP",
+     "middlePIP",
+     "middleMCP",
+     "ringTip",
+     "ringDIP",
+     "ringPIP",
+     "ringMCP",
+     "littleTip",
+     "littleDIP",
+     "littlePIP",
+     "littleMCP",
+     "thumbTip",
+     "thumbIP",
+     "thumbMP",
+     "thumbCMC",
+ ]
+ BODY_LANDMARKS = [
+     "nose",
+     "neck",
+     "rightEye",
+     "leftEye",
+     "rightEar",
+     "leftEar",
+     "rightShoulder",
+     "leftShoulder",
+     "rightElbow",
+     "leftElbow",
+     "rightWrist",
+     "leftWrist",
+ ]
+ ARM_LANDMARKS_ORDER = ["neck", "$side$Shoulder", "$side$Elbow", "$side$Wrist"]
+ 
+ FLIP_IDXS = np.concatenate(
+     (
+         [0, 2, 1, 4, 3, 6, 5],
+         [17, 18, 19, 20, 21, 22, 23, 24, 25, 26],
+         [7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+     ),
+     axis=0,
+ )
+ 
+ SLGCN_JOINTS = {
+     59: np.concatenate((np.arange(0, 17), np.arange(91, 133)), axis=0),  # 59
+     31: np.concatenate(
+         (
+             np.arange(0, 11),
+             [91, 95, 96, 99, 100, 103, 104, 107, 108, 111],
+             [112, 116, 117, 120, 121, 124, 125, 128, 129, 132],
+         ),
+         axis=0,
+     ),  # 31
+     27: np.concatenate(
+         (
+             [0, 5, 6, 7, 8, 9, 10],
+             [91, 95, 96, 99, 100, 103, 104, 107, 108, 111],
+             [112, 116, 117, 120, 121, 124, 125, 128, 129, 132],
+         ),
+         axis=0,
+     ),  # 27
+ }
+ 
+ COCO_TO_POSE_FORMAT = {
+     0: ("POSE_LANDMARKS", "NOSE"),
+     1: ("POSE_LANDMARKS", "LEFT_EYE"),
+     2: ("POSE_LANDMARKS", "RIGHT_EYE"),
+     3: ("POSE_LANDMARKS", "LEFT_EAR"),
+     4: ("POSE_LANDMARKS", "RIGHT_EAR"),
+     5: ("POSE_LANDMARKS", "LEFT_SHOULDER"),
+     6: ("POSE_LANDMARKS", "RIGHT_SHOULDER"),
+     7: ("POSE_LANDMARKS", "LEFT_ELBOW"),
+     8: ("POSE_LANDMARKS", "RIGHT_ELBOW"),
+     9: ("POSE_LANDMARKS", "LEFT_WRIST"),
+     10: ("POSE_LANDMARKS", "RIGHT_WRIST"),
+     11: ("POSE_LANDMARKS", "LEFT_HIP"),
+     12: ("POSE_LANDMARKS", "RIGHT_HIP"),
+     13: ("POSE_LANDMARKS", "LEFT_KNEE"),
+     14: ("POSE_LANDMARKS", "RIGHT_KNEE"),
+     15: ("POSE_LANDMARKS", "LEFT_ANKLE"),
+     16: ("POSE_LANDMARKS", "RIGHT_ANKLE"),
+     91: ("LEFT_HAND_LANDMARKS", "WRIST"),
+     92: ("LEFT_HAND_LANDMARKS", "THUMB_CMC"),
+     93: ("LEFT_HAND_LANDMARKS", "THUMB_MCP"),
+     94: ("LEFT_HAND_LANDMARKS", "THUMB_IP"),
+     95: ("LEFT_HAND_LANDMARKS", "THUMB_TIP"),
+     96: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_MCP"),
+     97: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_PIP"),
+     98: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_DIP"),
+     99: ("LEFT_HAND_LANDMARKS", "INDEX_FINGER_TIP"),
+     100: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_MCP"),
+     101: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_PIP"),
+     102: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_DIP"),
+     103: ("LEFT_HAND_LANDMARKS", "MIDDLE_FINGER_TIP"),
+     104: ("LEFT_HAND_LANDMARKS", "RING_FINGER_MCP"),
+     105: ("LEFT_HAND_LANDMARKS", "RING_FINGER_PIP"),
+     106: ("LEFT_HAND_LANDMARKS", "RING_FINGER_DIP"),
+     107: ("LEFT_HAND_LANDMARKS", "RING_FINGER_TIP"),
+     108: ("LEFT_HAND_LANDMARKS", "PINKY_MCP"),
+     109: ("LEFT_HAND_LANDMARKS", "PINKY_PIP"),
+     110: ("LEFT_HAND_LANDMARKS", "PINKY_DIP"),
+     111: ("LEFT_HAND_LANDMARKS", "PINKY_TIP"),
+     112: ("RIGHT_HAND_LANDMARKS", "WRIST"),
+     113: ("RIGHT_HAND_LANDMARKS", "THUMB_CMC"),
+     114: ("RIGHT_HAND_LANDMARKS", "THUMB_MCP"),
+     115: ("RIGHT_HAND_LANDMARKS", "THUMB_IP"),
+     116: ("RIGHT_HAND_LANDMARKS", "THUMB_TIP"),
+     117: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_MCP"),
+     118: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_PIP"),
+     119: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_DIP"),
+     120: ("RIGHT_HAND_LANDMARKS", "INDEX_FINGER_TIP"),
+     121: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_MCP"),
+     122: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_PIP"),
+     123: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_DIP"),
+     124: ("RIGHT_HAND_LANDMARKS", "MIDDLE_FINGER_TIP"),
+     125: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_MCP"),
+     126: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_PIP"),
+     127: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_DIP"),
+     128: ("RIGHT_HAND_LANDMARKS", "RING_FINGER_TIP"),
+     129: ("RIGHT_HAND_LANDMARKS", "PINKY_MCP"),
+     130: ("RIGHT_HAND_LANDMARKS", "PINKY_PIP"),
+     131: ("RIGHT_HAND_LANDMARKS", "PINKY_DIP"),
+     132: ("RIGHT_HAND_LANDMARKS", "PINKY_TIP"),
+ }
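As an illustration of how the joint-index tables are intended to be used, here is a sketch; the keypoint array is random, standing in for a COCO-WholeBody style (frames, 133, channels) pose sequence.

import numpy as np

pose_sequence = np.random.rand(150, 133, 3)        # placeholder (T, 133, C) keypoints
reduced = pose_sequence[:, SLGCN_JOINTS[27], :]    # keep the 27-joint SL-GCN subset -> (150, 27, 3)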
src/utils/loggers.py ADDED
@@ -0,0 +1,24 @@
+ import sys
+ import logging
+ from pathlib import Path
+ from transformers import TrainerCallback
+ 
+ 
+ class TrainingCallback(TrainerCallback):
+     def on_log(self, args, state, control, logs=None, **kwargs):
+         logging.info(logs)
+ 
+ 
+ def config_logger(log_file: str = None) -> None:
+     handlers = [logging.StreamHandler(sys.stdout)]
+     if log_file is not None:
+         log_dir = Path(log_file).parent
+         if not log_dir.exists():
+             log_dir.mkdir(parents=True, exist_ok=True)
+         handlers.append(logging.FileHandler(filename=log_file))
+     logging.basicConfig(
+         datefmt="%m/%d/%Y %H:%M:%S",
+         level=logging.INFO,
+         format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s",
+         handlers=handlers
+     )
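A typical call, assuming the caller wants both console and file logging; the log path below is illustrative.

import logging

config_logger(log_file="logs/inference.log")   # hypothetical path; omit to log to stdout only
logging.info("Pipeline initialised")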
src/visualization/__init__.py ADDED
@@ -0,0 +1 @@
+ from .utils import *
src/visualization/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (187 Bytes).
src/visualization/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (221 Bytes).
src/visualization/__pycache__/utils.cpython-312.pyc ADDED
Binary file (2.44 kB).
src/visualization/__pycache__/utils.cpython-39.pyc ADDED
Binary file (1.7 kB).
src/visualization/utils.py ADDED
@@ -0,0 +1,55 @@
+ import torch
+ import numpy as np
+ from imageio import mimsave
+ from PIL import Image, ImageDraw, ImageFont
+ 
+ 
+ def unnormalize_img(image: np.ndarray, std: tuple, mean: tuple) -> np.ndarray:
+     image = (image * std) + mean
+     image = np.clip(image * 255, 0, 255).astype('uint8')  # clip before casting to avoid uint8 overflow
+     return image
+ 
+ 
+ def save_as_gif(
+     video_tensor: torch.Tensor,
+     save_path: str = 'sample.gif',
+     std: tuple = None,
+     mean: tuple = None,
+ ):
+     frames = []
+     for video_frame in video_tensor:
+         frame_unnormalized = unnormalize_img(
+             image=video_frame.permute(1, 2, 0).numpy(),
+             std=std,
+             mean=mean,
+         )
+         frames.append(frame_unnormalized)
+     kwargs = {'duration': 0.25}
+     mimsave(save_path, frames, 'GIF', **kwargs)
+     return save_path
+ 
+ 
+ def display_gif(gif_path: str) -> Image.Image:
+     return Image.open(gif_path)  # PIL's Image module is not callable; open the file instead
+ 
+ 
+ def draw_text_on_image(
+     image: np.ndarray,
+     text: str,
+     position: tuple = (20, 20),
+     color: tuple = (0, 0, 255),
+     font_size: int = 20,
+ ) -> np.ndarray:
+     font = ImageFont.truetype(
+         font="fonts/OpenSans-Regular.ttf",
+         size=font_size,
+     )
+     pil_image = Image.fromarray(image)
+     draw = ImageDraw.Draw(pil_image)
+     draw.text(
+         xy=position,
+         text=text,
+         fill=color,
+         font=font,
+     )
+     return np.array(pil_image)
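A short usage sketch for these helpers; the frame contents and the predicted gloss/score are placeholders, and draw_text_on_image expects fonts/OpenSans-Regular.ttf to exist on disk.

import numpy as np

frame = np.zeros((480, 640, 3), dtype=np.uint8)                 # placeholder video frame
annotated = draw_text_on_image(frame, text="xin chào (0.92)")   # overlay a hypothetical prediction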