video-classification-Kitnetics400-myViT

Runtime error

App Files Files Community

JackWong0911 commited on Apr 9, 2024

Commit

ba5befc

verified ·

1 Parent(s): d999fef

Update app.py

Browse files

Files changed (1) hide show

app.py +491 -10

app.py CHANGED Viewed

@@ -18,13 +18,492 @@ from torchvision.transforms import (
     RandomHorizontalFlip,
     Resize,
 )
-from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
-MODEL_CKPT = "sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-MODEL = VideoMAEForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
-PROCESSOR = VideoMAEFeatureExtractor.from_pretrained(MODEL_CKPT)
 RESIZE_TO = PROCESSOR.size["shortest_edge"]
 NUM_FRAMES_TO_SAMPLE = MODEL.config.num_frames
@@ -122,17 +601,19 @@ gr.Interface(
     inputs=gr.Video(type="file"),
     outputs=gr.Label(num_top_classes=3),
     examples=[
-        ["examples/babycrawling.mp4"],
-        ["examples/baseball.mp4"],
-        ["examples/balancebeam.mp4"],
     ],
-    title="VideoMAE fine-tuned on a subset of UCF-101",
     description=(
-        "Gradio demo for VideoMAE for video classification. To use it, simply upload your video or click one of the"
         " examples to load them. Read more at the links below."
     ),
     article=(
-        "<div style='text-align: center;'><a href='https://huggingface.co/docs/transformers/model_doc/videomae' target='_blank'>VideoMAE</a>"
         " <center><a href='https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset' target='_blank'>Fine-tuned Model</a></center></div>"
     ),
     allow_flagging=False,

     RandomHorizontalFlip,
     Resize,
 )
+# my code below
+# import transformers.models.timesformer.modeling_timesformer
+from transformers.models.timesformer.modeling_timesformer import TimeSformerDropPath, TimeSformerAttention, TimesformerIntermediate, TimesformerOutput, TimesformerLayer, TimesformerEncoder, TimesformerModel, TIMESFORMER_INPUTS_DOCSTRING, _CONFIG_FOR_DOC, TimesformerEmbeddings, TimesformerForVideoClassification
+from transformers import TimesformerConfig
+configuration = TimesformerConfig()
+import collections
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn.functional
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput, ImageClassifierOutput
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from transformers.models.timesformer.configuration_timesformer import TimesformerConfig
+class MyTimesformerLayer(TimesformerLayer):
+    def __init__(self, config: configuration, layer_index: int) -> None:
+        super().__init__()
+        attention_type = config.attention_type
+        drop_path_rates = [
+            x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
+        ]  # stochastic depth decay rule
+        drop_path_rate = drop_path_rates[layer_index]
+        self.drop_path = TimeSformerDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.attention = TimeSformerAttention(config)
+        self.intermediate = TimesformerIntermediate(config)
+        self.output = TimesformerOutput(config)
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.config = config
+        self.attention_type = attention_type
+        if attention_type not in ["divided_space_time", "space_only", "joint_space_time"]:
+            raise ValueError("Unknown attention type: {}".format(attention_type))
+        # Temporal Attention Parameters
+        if self.attention_type == "divided_space_time":
+            self.temporal_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+            self.temporal_attention = TimeSformerAttention(config)
+            self.temporal_dense = nn.Linear(config.hidden_size, config.hidden_size)
+    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False):
+        num_frames = self.config.num_frames
+        num_patch_width = self.config.image_size // self.config.patch_size
+        batch_size = hidden_states.shape[0]
+        num_spatial_tokens = (hidden_states.size(1) - 1) // num_frames
+        num_patch_height = num_spatial_tokens // num_patch_width
+        if self.attention_type in ["space_only", "joint_space_time"]:
+            self_attention_outputs = self.attention(
+                self.layernorm_before(hidden_states), output_attentions=output_attentions
+            )
+            attention_output = self_attention_outputs[0]
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+            hidden_states = hidden_states + self.drop_path(attention_output)
+            layer_output = self.layernorm_after(hidden_states)
+            layer_output = self.intermediate(layer_output)
+            layer_output = self.output(layer_output)
+            layer_output = hidden_states + self.drop_path(layer_output)
+            outputs = (layer_output,) + outputs
+            return outputs
+        elif self.attention_type == "divided_space_time":
+            # Spatial
+            init_cls_token = hidden_states[:, 0, :].unsqueeze(1)
+            cls_token = init_cls_token.repeat(1, num_frames, 1)
+            cls_token = cls_token.reshape(batch_size * num_frames, 1, cls_token.shape[2])
+            spatial_embedding = hidden_states[:, 1:, :]
+            spatial_embedding = (
+                spatial_embedding.reshape(
+                    batch_size, num_patch_height, num_patch_width, num_frames, spatial_embedding.shape[2]
+                )
+                .permute(0, 3, 1, 2, 4)
+                .reshape(batch_size * num_frames, num_patch_height * num_patch_width, spatial_embedding.shape[2])
+            )
+            spatial_embedding = torch.cat((cls_token, spatial_embedding), 1)
+            spatial_attention_outputs = self.attention(
+                self.layernorm_before(spatial_embedding), output_attentions=output_attentions
+            )
+            attention_output = spatial_attention_outputs[0]
+            outputs = spatial_attention_outputs[1:]  # add self attentions if we output attention weights
+            residual_spatial = self.drop_path(attention_output)
+            # Taking care of CLS token
+            cls_token = residual_spatial[:, 0, :]
+            cls_token = cls_token.reshape(batch_size, num_frames, cls_token.shape[1])
+            cls_token = torch.mean(cls_token, 1, True)  # averaging for every frame
+            residual_spatial = residual_spatial[:, 1:, :]
+            residual_spatial = (
+                residual_spatial.reshape(
+                    batch_size, num_frames, num_patch_height, num_patch_width, residual_spatial.shape[2]
+                )
+                .permute(0, 2, 3, 1, 4)
+                .reshape(batch_size, num_patch_height * num_patch_width * num_frames, residual_spatial.shape[2])
+            )
+            residual = residual_spatial
+            hidden_states = hidden_states[:, 1:, :] + residual_spatial
+            # Temporal
+            temporal_embedding = hidden_states
+            temporal_embedding = temporal_embedding.reshape(
+                batch_size, num_patch_height, num_patch_width, num_frames, temporal_embedding.shape[2]
+            ).reshape(batch_size * num_patch_height * num_patch_width, num_frames, temporal_embedding.shape[2])
+            temporal_attention_outputs = self.temporal_attention(
+                self.temporal_layernorm(temporal_embedding),
+            )
+            attention_output = temporal_attention_outputs[0]
+            residual_temporal = self.drop_path(attention_output)
+            residual_temporal = residual_temporal.reshape(
+                batch_size, num_patch_height, num_patch_width, num_frames, residual_temporal.shape[2]
+            ).reshape(batch_size, num_patch_height * num_patch_width * num_frames, residual_temporal.shape[2])
+            residual_temporal = self.temporal_dense(residual_temporal)
+            hidden_states = hidden_states + residual_temporal
+            # Mlp
+            hidden_states = torch.cat((init_cls_token, hidden_states), 1) + torch.cat((cls_token, residual_temporal), 1)
+            layer_output = self.layernorm_after(hidden_states)
+            layer_output = self.intermediate(layer_output)
+            layer_output = self.output(layer_output)
+            layer_output = hidden_states + self.drop_path(layer_output)
+            outputs = (layer_output,) + outputs
+            return outputs
+import transformers.models.timesformer.modeling_timesformer
+class MyTimesformerEncoder(TimesformerEncoder):
+    def __init__(self, config: configuration) -> None:
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([MyTimesformerLayer(config, ind) for ind in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer_module.__call__,
+                    hidden_states,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = layer_module(hidden_states, output_attentions)
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+class MyTimesformerModel(TimesformerModel):
+    def __init__(self, config: configuration):
+        super().__init__(config)
+        self.config = config
+        self.embeddings = TimesformerEmbeddings(config)
+        self.encoder = TimesformerEncoder(config)
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+    @add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
+        r"""
+        Returns:
+        Examples:
+        ```python
+        >>> import av
+        >>> import numpy as np
+        >>> from transformers import AutoImageProcessor, TimesformerModel
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`List[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     '''
+        ...     Sample a given number of frame indices from the video.
+        ...     Args:
+        ...         clip_len (`int`): Total number of frames to sample.
+        ...         frame_sample_rate (`int`): Sample every n-th frame.
+        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
+        ...     Returns:
+        ...         indices (`List[int]`): List of sampled frame indices
+        ...     '''
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+        >>> # sample 8 frames
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
+        >>> video = read_video_pyav(container, indices)
+        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
+        >>> model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k400")
+        >>> # prepare video for the model
+        >>> inputs = image_processor(list(video), return_tensors="pt")
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 1569, 768]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        embedding_output = self.embeddings(pixel_values)
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        if self.layernorm is not None:
+            sequence_output = self.layernorm(sequence_output)
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+class MyTimesformerForVideoClassification(TimesformerForVideoClassification):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.timesformer = MyTimesformerModel(config)
+        # Classifier head
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        # Initialize weights and apply final processing
+        self.post_init()
+    @add_start_docstrings_to_model_forward(TIMESFORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, ImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        Returns:
+        Examples:
+        ```python
+        >>> import av
+        >>> import torch
+        >>> import numpy as np
+        >>> from transformers import AutoImageProcessor, TimesformerForVideoClassification
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`List[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     '''
+        ...     Sample a given number of frame indices from the video.
+        ...     Args:
+        ...         clip_len (`int`): Total number of frames to sample.
+        ...         frame_sample_rate (`int`): Sample every n-th frame.
+        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
+        ...     Returns:
+        ...         indices (`List[int]`): List of sampled frame indices
+        ...     '''
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+        >>> # sample 8 frames
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
+        >>> video = read_video_pyav(container, indices)
+        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
+        >>> model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")
+        >>> inputs = image_processor(list(video), return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+        ...     logits = outputs.logits
+        >>> # model predicts one of the 400 Kinetics-400 classes
+        >>> predicted_label = logits.argmax(-1).item()
+        >>> print(model.config.id2label[predicted_label])
+        eating spaghetti
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.timesformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0][:, 0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+from transformers import AutoImageProcessor
+MODEL_CKPT = "JackWong0911/timesformer-base-finetuned-k400-kinetic400-subset-epoch6real-num_frame_10_myViT2_more_data"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+MODEL = MyTimesformerForVideoClassification.from_pretrained(MODEL_CKPT).to(DEVICE)
+PROCESSOR = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
 RESIZE_TO = PROCESSOR.size["shortest_edge"]
 NUM_FRAMES_TO_SAMPLE = MODEL.config.num_frames
     inputs=gr.Video(type="file"),
     outputs=gr.Label(num_top_classes=3),
     examples=[
+        ["examples/archery.mp4"],
+        ["examples/bowling.mp4"],
+        ["examples/flying_kite.mp4"],
+        ["examples/high_jump.mp4"],
+        ["examples/marching.mp4"],
     ],
+    title="MyViT fine-tuned on a subset of Kinetics400",
     description=(
+        "Gradio demo for MyViT for video classification. To use it, simply upload your video or click one of the"
         " examples to load them. Read more at the links below."
     ),
     article=(
+        "<div style='text-align: center;'><a href='https://huggingface.co/docs/transformers/model_doc/videomae' target='_blank'>MyViT</a>"
         " <center><a href='https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset' target='_blank'>Fine-tuned Model</a></center></div>"
     ),
     allow_flagging=False,