hvaldez committed on
Commit c18a21e
1 Parent(s): 8d42677

first commit

.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/15AKPEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/184EHEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/CC0LBEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/FLY2FEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/P9SOAEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/PRODQEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/QLXEXEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/S8YZIEGO.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/charades_ego/video/X2JTKEGO.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,56 @@
+ import gradio as gr
+
+ from demo import VideoCLSModel
+
+ sample_videos = [
+     'data/charades_ego/video/P9SOAEGO.mp4',
+     'data/charades_ego/video/6D5DHEGO.mp4',
+     'data/charades_ego/video/15AKPEGO.mp4',
+     'data/charades_ego/video/X2JTKEGO.mp4',
+     'data/charades_ego/video/184EHEGO.mp4',
+     'data/charades_ego/video/S8YZIEGO.mp4',
+     'data/charades_ego/video/PRODQEGO.mp4',
+     'data/charades_ego/video/QLXEXEGO.mp4',
+     'data/charades_ego/video/CC0LBEGO.mp4',
+     'data/charades_ego/video/FLY2FEGO.mp4'
+ ]
+
+ def main():
+     svitt = VideoCLSModel("configs/charades_ego/svitt.yml")
+
+     def predict(video_str):
+         video_file = video_str.split('/')[-1]
+         idx = 0  # fall back to the first sample if the filename is not matched below
+         for i, item in enumerate(sample_videos):
+             if video_file in item:
+                 idx = i
+
+         ft_action, gt_action = svitt.predict(idx)
+
+         return gt_action, ft_action
+
+     with gr.Blocks() as demo:
+         gr.Markdown(
+             """
+             # SViTT-Ego for Action Recognition
+             Choose a sample video and click predict to view the results.
+             """
+         )
+         with gr.Row():
+             idx = gr.Number(label="Idx", visible=False)
+             video = gr.Video(label='video', format='mp4', autoplay=True, height=256, width=256)
+         with gr.Row():
+             label = gr.Text(label="Ground Truth")
+             ours = gr.Text(label="SViTT-Ego prediction")
+         with gr.Row():
+             btn = gr.Button("Predict", variant="primary")
+             btn.click(predict, inputs=[video], outputs=[label, ours])
+         with gr.Column():
+             gr.Examples(examples=[[x] for x in sample_videos], inputs=[video])
+
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     main()
+
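For reference, a minimal sketch of driving the same model without the Gradio UI, assuming the checkpoint, configs, and sample CSVs added in this commit are in place locally:

# sketch: one prediction without the web UI, using only code added in this commit
from demo import VideoCLSModel

svitt = VideoCLSModel("configs/charades_ego/svitt.yml")
pred_actions, gt_actions = svitt.predict(0)  # index 0 -> data/charades_ego/csv/0.csv
print("ground truth:", gt_actions)
print("predicted  :", pred_actions)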
ckpt/svitt-ego.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73d9612778da3471372bc46a9e10fd6c1ed66dd2c7a715bf34472d795bd0bf58
+ size 2500516566
configs/base.yml ADDED
@@ -0,0 +1,21 @@
+ model:
+   pretrain: ""
+   resume: ""
+   timesformer_freeze_space: false
+   drop_path_rate: 0.1
+   dropout_ratio: 0.5
+   freeze_vis_backbone: false
+   freeze_txt_backbone: false
+   use_vn_classifier: false
+
+ data:
+   dataset: ek100_mir
+   root: datasets/EK100/video_ht256px
+   metadata: datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_train.csv
+   metadata_val: datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_test.csv
+   relevancy_path: datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/caption_relevancy_EPIC_100_retrieval_test.pkl
+   clip_length: 16
+   clip_stride: 4
+   sparse_sample: false
+   num_crops: 1
+   num_clips: 1
configs/charades_ego/action-recognition.yaml ADDED
@@ -0,0 +1,37 @@
+
+ text_encoder: bert-base-uncased
+ bert_config: configs/config_bert.json
+ vit_type: beit  # items in ${vit_zoo}
+ vit_zoo:  # from huggingface
+   beit: microsoft/beit-base-patch16-224-pt22k-ft22k
+ vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]}
+
+ vision_encoder_args:
+   token_keep_rate: 0.7
+   token_keep_strategy: cls_attn
+   token_drop_loc: [3, 6, 9]
+   sparse_local_attn: 1
+   sparse_random_attn: 5
+   attn_block_size: 56
+
+ image_res: 224
+ embed_dim: 256
+ video_input:
+   num_frames: 4
+   reader: decord  # one of [decord, av]
+   sample_type: rand
+   num_frames_test: 16  # num_frames during inference/test
+   sample_type_test: middle
+ max_txt_l:
+   image: 32
+   video: 32
+
+ batch_size:
+   image: 8
+   video: 8
+ batch_size_test:
+   image: 8
+   video: 8
+ k_test: 128
+ temp: 0.18
+ mlm_prob: 0.5
configs/charades_ego/svitt.yml ADDED
@@ -0,0 +1,14 @@
+ model:
+   pretrain: ckpt/svitt-ego.pth
+   freeze_vis_backbone: true
+   freeze_txt_backbone: true
+   num_frames: 16
+   config: configs/charades_ego/action-recognition.yaml
+
+ data:
+   dataset: charades_ego
+   root: data/charades_ego/video
+   metadata_val: data/charades_ego/csv/{}.csv
+   label_map: meta/charades_ego/charades_ego.json
+   clip_length: 16
+   sparse_sample: true
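For orientation, this experiment file is merged over configs/base.yml by svitt/config.py (added later in this commit); a minimal sketch of how demo.py resolves it:

# sketch: how demo.py resolves this config via svitt/config.py
from svitt.config import load_cfg, setup_config

cfg = load_cfg("configs/charades_ego/svitt.yml")         # base.yml updated with this file
model_cfg = setup_config(cfg["model"]["config"])         # action-recognition.yaml as OmegaConf
print(cfg["data"]["dataset"], cfg["model"]["pretrain"])  # charades_ego ckpt/svitt-ego.pth
print(model_cfg.text_encoder)                            # bert-base-uncased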
configs/config_bert.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "architectures": [
+     "BertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "type_vocab_size": 2,
+   "vocab_size": 30522,
+   "fusion_layer": 9,
+   "encoder_width": 768
+ }
data/charades_ego/Charades_v1_classes.txt ADDED
@@ -0,0 +1,157 @@
+ c000 Holding some clothes
+ c001 Putting clothes somewhere
+ c002 Taking some clothes from somewhere
+ c003 Throwing clothes somewhere
+ c004 Tidying some clothes
+ c005 Washing some clothes
+ c006 Closing a door
+ c007 Fixing a door
+ c008 Opening a door
+ c009 Putting something on a table
+ c010 Sitting on a table
+ c011 Sitting at a table
+ c012 Tidying up a table
+ c013 Washing a table
+ c014 Working at a table
+ c015 Holding a phone/camera
+ c016 Playing with a phone/camera
+ c017 Putting a phone/camera somewhere
+ c018 Taking a phone/camera from somewhere
+ c019 Talking on a phone/camera
+ c020 Holding a bag
+ c021 Opening a bag
+ c022 Putting a bag somewhere
+ c023 Taking a bag from somewhere
+ c024 Throwing a bag somewhere
+ c025 Closing a book
+ c026 Holding a book
+ c027 Opening a book
+ c028 Putting a book somewhere
+ c029 Smiling at a book
+ c030 Taking a book from somewhere
+ c031 Throwing a book somewhere
+ c032 Watching/Reading/Looking at a book
+ c033 Holding a towel/s
+ c034 Putting a towel/s somewhere
+ c035 Taking a towel/s from somewhere
+ c036 Throwing a towel/s somewhere
+ c037 Tidying up a towel/s
+ c038 Washing something with a towel
+ c039 Closing a box
+ c040 Holding a box
+ c041 Opening a box
+ c042 Putting a box somewhere
+ c043 Taking a box from somewhere
+ c044 Taking something from a box
+ c045 Throwing a box somewhere
+ c046 Closing a laptop
+ c047 Holding a laptop
+ c048 Opening a laptop
+ c049 Putting a laptop somewhere
+ c050 Taking a laptop from somewhere
+ c051 Watching a laptop or something on a laptop
+ c052 Working/Playing on a laptop
+ c053 Holding a shoe/shoes
+ c054 Putting shoes somewhere
+ c055 Putting on shoe/shoes
+ c056 Taking shoes from somewhere
+ c057 Taking off some shoes
+ c058 Throwing shoes somewhere
+ c059 Sitting in a chair
+ c060 Standing on a chair
+ c061 Holding some food
+ c062 Putting some food somewhere
+ c063 Taking food from somewhere
+ c064 Throwing food somewhere
+ c065 Eating a sandwich
+ c066 Making a sandwich
+ c067 Holding a sandwich
+ c068 Putting a sandwich somewhere
+ c069 Taking a sandwich from somewhere
+ c070 Holding a blanket
+ c071 Putting a blanket somewhere
+ c072 Snuggling with a blanket
+ c073 Taking a blanket from somewhere
+ c074 Throwing a blanket somewhere
+ c075 Tidying up a blanket/s
+ c076 Holding a pillow
+ c077 Putting a pillow somewhere
+ c078 Snuggling with a pillow
+ c079 Taking a pillow from somewhere
+ c080 Throwing a pillow somewhere
+ c081 Putting something on a shelf
+ c082 Tidying a shelf or something on a shelf
+ c083 Reaching for and grabbing a picture
+ c084 Holding a picture
+ c085 Laughing at a picture
+ c086 Putting a picture somewhere
+ c087 Taking a picture of something
+ c088 Watching/looking at a picture
+ c089 Closing a window
+ c090 Opening a window
+ c091 Washing a window
+ c092 Watching/Looking outside of a window
+ c093 Holding a mirror
+ c094 Smiling in a mirror
+ c095 Washing a mirror
+ c096 Watching something/someone/themselves in a mirror
+ c097 Walking through a doorway
+ c098 Holding a broom
+ c099 Putting a broom somewhere
+ c100 Taking a broom from somewhere
+ c101 Throwing a broom somewhere
+ c102 Tidying up with a broom
+ c103 Fixing a light
+ c104 Turning on a light
+ c105 Turning off a light
+ c106 Drinking from a cup/glass/bottle
+ c107 Holding a cup/glass/bottle of something
+ c108 Pouring something into a cup/glass/bottle
+ c109 Putting a cup/glass/bottle somewhere
+ c110 Taking a cup/glass/bottle from somewhere
+ c111 Washing a cup/glass/bottle
+ c112 Closing a closet/cabinet
+ c113 Opening a closet/cabinet
+ c114 Tidying up a closet/cabinet
+ c115 Someone is holding a paper/notebook
+ c116 Putting their paper/notebook somewhere
+ c117 Taking paper/notebook from somewhere
+ c118 Holding a dish
+ c119 Putting a dish/es somewhere
+ c120 Taking a dish/es from somewhere
+ c121 Wash a dish/dishes
+ c122 Lying on a sofa/couch
+ c123 Sitting on sofa/couch
+ c124 Lying on the floor
+ c125 Sitting on the floor
+ c126 Throwing something on the floor
+ c127 Tidying something on the floor
+ c128 Holding some medicine
+ c129 Taking/consuming some medicine
+ c130 Putting groceries somewhere
+ c131 Laughing at television
+ c132 Watching television
+ c133 Someone is awakening in bed
+ c134 Lying on a bed
+ c135 Sitting in a bed
+ c136 Fixing a vacuum
+ c137 Holding a vacuum
+ c138 Taking a vacuum from somewhere
+ c139 Washing their hands
+ c140 Fixing a doorknob
+ c141 Grasping onto a doorknob
+ c142 Closing a refrigerator
+ c143 Opening a refrigerator
+ c144 Fixing their hair
+ c145 Working on paper/notebook
+ c146 Someone is awakening somewhere
+ c147 Someone is cooking something
+ c148 Someone is dressing
+ c149 Someone is laughing
+ c150 Someone is running somewhere
+ c151 Someone is going from standing to sitting
+ c152 Someone is smiling
+ c153 Someone is sneezing
+ c154 Someone is standing up from somewhere
+ c155 Someone is undressing
+ c156 Someone is eating something
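The label map consumed by the demo (meta/charades_ego/label_map.json below) is essentially this file parsed into a label list plus a class-id-to-index mapping; a minimal sketch of that parse, assuming the id and name are separated by the first space:

# sketch: parse Charades_v1_classes.txt into labels / mapping_vn2act
labels, mapping_vn2act = [], {}
with open("data/charades_ego/Charades_v1_classes.txt") as f:
    for line in f:
        cls_id, name = line.strip().split(" ", 1)
        mapping_vn2act[cls_id] = len(labels)
        labels.append(name)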
data/charades_ego/csv/0.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ P9SOAEGO,Z241,Stairs,6.0,6.0,Yes,"A person holding a broom walks up and down the stairs, brushing nonchalantly on each step. They put the broom down and pull out a vial of medicine from their pocket, then toss it down the stairs.",broom;floor;medicine;stairs,,c099 0.00 23.21;c127 1.20 23.21;c102 0.00 23.21;c100 0.00 23.21;c098 0.00 23.21;c101 25.20 23.21,33.33,Yes,LY2GQ
data/charades_ego/csv/1.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ 6D5DHEGO,UTMU,Kitchen,5.0,5.0,Yes,"The person put a pot on the stove and then turned to the counter to grab a cup. The person drank from the cup for a little. The person then poured the rest of the contents of the drink into the sink. The person then went to leave the room, grabbing the doorknob.",food;glass;sink;stove,,c109 20.30 21.38;c107 7.70 17.90;c147 0.00 8.70;c110 7.50 13.60,34.5,Yes,
data/charades_ego/csv/2.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ 15AKPEGO,I2IV,Home Office / Study (A room in a house used for work),7.0,7.0,Yes,A person is putting a box on the shelf and then closing the cabinet.,box;cabinet;desk;door;shelf,,c043 4.00 10.30;c040 5.30 19.80;c112 19.10 26.70;c042 14.10 20.70;c081 14.10 20.70;c114 15.00 20.70;c006 18.80 28.70,30.83,Yes,IA5TC
data/charades_ego/csv/3.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ X2JTKEGO,EV0Z,Recreation room / Man cave,6.0,5.0,Yes,"Person is watching television while another person is walking towards the shelf, grasping the camera.",bed;camera;drawer;television,,c135 0.00 30.30;c132 0.00 30.50;c016 7.70 29.80;c015 0.00 25.50,31.75,Yes,D3PPI
data/charades_ego/csv/4.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ 184EHEGO,KFGP,Home Office / Study (A room in a house used for work),6.0,6.0,Yes,A person is eating at a table while they play on a laptop.,chair;food;laptop;table,,c011 1.20 18.00;c059 0.20 17.60;c052 2.10 23.62;c156 0.40 23.62;c010 15.70 23.62;c014 14.40 23.62,37.17,Yes,CUB69
data/charades_ego/csv/5.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ S8YZIEGO,I2IV,Kitchen,6.0,7.0,Yes,"A person is cooking food on the stove. The person takes a picture of the food with their phone, then puts the phone back into their pocket.",food;phone/camera;pot;stirring utensil;stove,,c147 0.00 24.67;c017 24.50 24.67;c087 16.50 23.50;c015 8.60 24.67;c016 8.10 24.67;c018 5.80 13.60;c147 0.00 10.80;c017 27.10 24.67;c154 0.00 24.67,34.62,Yes,DUEEE
data/charades_ego/csv/6.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ PRODQEGO,DJ17,Kitchen,6.0,7.0,Yes,A person is cooking at a stove then they begin to play with some food that's on the counter.,dish;food;shelf;stove;table,,c009 20.90 30.00;c064 22.50 29.90;c063 16.50 23.10;c147 0.00 18.60;c061 19.10 31.79,32.17,Yes,Y5826
data/charades_ego/csv/7.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ QLXEXEGO,4OHY,Hallway,6.0,5.0,Yes,"A person smiles as they fix the door. The person laughs hysterically, then throws their bag of tools across the room.",door,,c006 7.50 22.20;c008 32.30 29.71;c152 17.40 26.00;c007 0.00 27.40,40.33,Yes,K3T1B
data/charades_ego/csv/8.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ CC0LBEGO,3VLX,Recreation room / Man cave,6.0,7.0,Yes,The person is fixing a shelf in the Recreation room / Man cave. Once the person was finished the person grabbed a glass of water and took a few sips. The person then looked around and left the room.,dirt;glass;shelf;water,,c110 0.00 32.38;c110 22.00 26.90;c106 22.30 26.70;c107 20.90 26.30;c082 1.40 11.40,36.29,Yes,
data/charades_ego/csv/9.csv ADDED
@@ -0,0 +1,2 @@
+ id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,egocentric,charades_video
+ FLY2FEGO,VT5W,Kitchen,7.0,7.0,Yes,A person opens a cabinet and takes out some coffee and then closes the cabinet.,closet/cabinet;coffee,,c113 0.00 4.20;c112 3.90 9.90,31.58,Yes,EJJIO
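In these CSVs the actions column packs "class start end" triplets separated by semicolons; the charades_ego branch of svitt/datasets.py splits it exactly this way when building samples. A small worked example on the first row above:

# worked example: decoding one "actions" cell (from csv/0.csv)
row_actions = "c099 0.00 23.21;c127 1.20 23.21;c102 0.00 23.21"
for action_tuple in row_actions.split(";"):
    action, start_timestamp, end_timestamp = action_tuple.split(" ")
    print(action, float(start_timestamp), float(end_timestamp))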
data/charades_ego/video/15AKPEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1dbe66fb96be257ae24c122a8f534e893883aa058872932ea630d20e9a609a1f
+ size 1150119
data/charades_ego/video/184EHEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:115fea186961bddc3bc8e9f10de4472a450720b143361f598257308d64bfd1b9
+ size 1459016
data/charades_ego/video/6D5DHEGO.mp4 ADDED
Binary file (871 kB)
data/charades_ego/video/CC0LBEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b80523915026d3e23740b4eb057362809e6375e973e0b2fad42a5f11e4331629
+ size 1414227
data/charades_ego/video/FLY2FEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b45b12ffeeeac6ae74955c37fe57bc468d309aaae188f9020cde88c3f7dd68da
+ size 1215686
data/charades_ego/video/P9SOAEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dfc22189ed6d228eca382474ff668f52d0c2b034204241837a4ea00c6b650fd5
+ size 2497723
data/charades_ego/video/PRODQEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cd034ab3146bbb36264f5ddbf7b451d09e3e1002c4550e565805710e8d5a1cd
+ size 1318941
data/charades_ego/video/QLXEXEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29cc6ec737fe810378ba1f91363078194cf73a4e8e9dcafb45013e04e426bb94
+ size 1834803
data/charades_ego/video/S8YZIEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d25f66f11549b8dac6a05c0f9405f1dc3a77df68a56430b76d33ab830300fd04
+ size 1439584
data/charades_ego/video/X2JTKEGO.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:617b7e321a96271f9603869999a7dc4ac26bf2e8d5f43ab8758428731373de6d
+ size 1976273
demo.py ADDED
@@ -0,0 +1,226 @@
+ ### demo.py
+ # Define model classes for inference.
+ ###
+ import json
+ import numpy as np
+ import os
+ import pandas as pd
+
+ import torch
+ import torch.nn as nn
+ import torch.backends.cudnn as cudnn
+ import torchvision.transforms as transforms
+ import torchvision.transforms._transforms_video as transforms_video
+ from sklearn.metrics import confusion_matrix
+ from einops import rearrange
+ from transformers import BertTokenizer
+
+ from svitt.model import SViTT
+ from svitt.datasets import VideoClassyDataset
+ from svitt.video_transforms import Permute
+ from svitt.config import load_cfg, setup_config
+ from svitt.evaluation_charades import charades_map
+ from svitt.evaluation import get_mean_accuracy
+
+
+ class VideoModel(nn.Module):
+     """ Base model for video understanding based on SViTT architecture. """
+     def __init__(self, config):
+         """ Initializes the model.
+         Parameters:
+             config: config file
+         """
+         super(VideoModel, self).__init__()
+         self.cfg = load_cfg(config)
+         self.model = self.build_model()
+         self.templates = ['{}']
+         self.dataset = self.cfg['data']['dataset']
+         self.eval()
+
+     def build_model(self):
+         cfg = self.cfg
+         if cfg['model'].get('pretrain', False):
+             ckpt_path = cfg['model']['pretrain']
+         else:
+             raise Exception('no checkpoint found')
+
+         if cfg['model'].get('config', False):
+             config_path = cfg['model']['config']
+         else:
+             raise Exception('no model config found')
+
+         self.model_cfg = setup_config(config_path)
+         self.tokenizer = BertTokenizer.from_pretrained(self.model_cfg.text_encoder)
+         model = SViTT(config=self.model_cfg, tokenizer=self.tokenizer)
+
+         print(f"Loading checkpoint from {ckpt_path}")
+         checkpoint = torch.load(ckpt_path, map_location="cpu")
+         state_dict = checkpoint["model"]
+
+         # fix for zero-shot evaluation
+         for key in list(state_dict.keys()):
+             if "bert" in key:
+                 encoder_key = key.replace("bert.", "")
+                 state_dict[encoder_key] = state_dict[key]
+
+         if torch.cuda.is_available():
+             model.cuda()
+
+         model.load_state_dict(state_dict, strict=False)
+
+         return model
+
+
+
+     def eval(self):
+         cudnn.benchmark = True
+         for p in self.model.parameters():
+             p.requires_grad = False
+         self.model.eval()
+
+
+ class VideoCLSModel(VideoModel):
+     """ Video model for video classification tasks (Charades-Ego, EGTEA). """
+     def __init__(self, config):
+         super(VideoCLSModel, self).__init__(config)
+         self.labels, self.mapping_vn2act = self.gen_label_map()
+         self.text_features = self.get_text_features()
+
+     def gen_label_map(self):
+         labelmap = self.cfg.get('label_map', 'meta/charades_ego/label_map.json')
+         if os.path.isfile(labelmap):
+             print(f"=> Loading label maps from {labelmap}")
+             meta = json.load(open(labelmap, 'r'))
+             labels, mapping_vn2act = meta['labels'], meta['mapping_vn2act']
+         else:
+             from svitt.preprocess import generate_label_map
+             labels, mapping_vn2act = generate_label_map(self.dataset)
+             meta = {'labels': labels, 'mapping_vn2act': mapping_vn2act}
+             meta_dir = f'meta/{self.dataset}'
+             if not os.path.exists(meta_dir):
+                 os.makedirs(meta_dir)
+             json.dump(meta, open(f'{meta_dir}/label_map.json', 'w'))
+             print(f"=> Label map is generated and saved to {meta_dir}/label_map.json")
+
+         return labels, mapping_vn2act
+
+     def load_data(self, idx=None):
+         print(f"=> Creating dataset")
+         cfg, dataset = self.cfg, self.dataset
+         data_cfg = cfg['data']
+         crop_size = 224
+         val_transform = transforms.Compose([
+             Permute([3, 0, 1, 2]),  # T H W C -> C T H W
+             transforms.Resize(crop_size),
+             transforms.CenterCrop(crop_size),
+             transforms_video.NormalizeVideo(
+                 mean=[108.3272985, 116.7460125, 104.09373615000001],
+                 std=[68.5005327, 66.6321579, 70.32316305],
+             ),
+         ])
+
+         if idx is None:
+             metadata_val = data_cfg['metadata_val']
+         else:
+             metadata_val = data_cfg['metadata_val'].format(idx)
+         if dataset in ['charades_ego', 'egtea']:
+             val_dataset = VideoClassyDataset(
+                 dataset,
+                 data_cfg['root'],
+                 metadata_val,
+                 transform=val_transform,
+                 is_training=False,
+                 label_mapping=self.mapping_vn2act,
+                 is_trimmed=False,
+                 num_clips=1,
+                 clip_length=data_cfg['clip_length'],
+                 clip_stride=data_cfg['clip_stride'],
+                 sparse_sample=data_cfg['sparse_sample'],
+             )
+         else:
+             raise NotImplementedError
+
+         val_loader = torch.utils.data.DataLoader(
+             val_dataset, batch_size=8, shuffle=False,
+             num_workers=4, pin_memory=True, sampler=None, drop_last=False
+         )
+
+         return val_loader
+
+     @torch.no_grad()
+     def get_text_features(self):
+         print('=> Extracting text features')
+         embeddings = self.tokenizer(
+             self.labels,
+             padding="max_length",
+             truncation=True,
+             max_length=self.model_cfg.max_txt_l.video,
+             return_tensors="pt",
+         )
+         _, class_embeddings = self.model.encode_text(embeddings)
+         return class_embeddings
+
+     @torch.no_grad()
+     def forward(self, idx=None):
+         print('=> Start forwarding')
+         val_loader = self.load_data(idx)
+         all_outputs = []
+         all_targets = []
+         for i, values in enumerate(val_loader):
+             images = values[0]
+             target = values[1]
+
+             if torch.cuda.is_available():
+                 images = images.cuda(non_blocking=True)
+                 target = target.cuda(non_blocking=True)
+
+             # encode images
+             images = rearrange(images, 'b c k h w -> b k c h w')
+             dims = images.shape
+             images = images.reshape(-1, 4, dims[-3], dims[-2], dims[-1])
+
+             image_features, _ = self.model.encode_image(images)
+
+             if image_features.ndim == 3:
+                 image_features = rearrange(image_features, '(b k) n d -> b (k n) d', b=1)
+             else:
+                 image_features = rearrange(image_features, '(b k) d -> b k d', b=1)
+
+             # cosine similarity as logits
+             similarity = self.model.get_sim(image_features, self.text_features)[0]
+
+             all_outputs.append(similarity.cpu())
+             all_targets.append(target.cpu())
+
+         all_outputs = torch.cat(all_outputs)
+         all_targets = torch.cat(all_targets)
+
+         return all_outputs, all_targets
+
+     @torch.no_grad()
+     def predict(self, idx=0):
+         all_outputs, all_targets = self.forward(idx)
+         preds, targets = all_outputs.numpy(), all_targets.numpy()
+         #sel = np.where(np.cumsum(sorted(preds[0].tolist(), reverse=True)) > 0.06)[0][0]
+         sel = 5
+         df = pd.DataFrame(self.labels)
+         pred_action = df.iloc[preds[0].argsort()[-sel:]].values.tolist()
+         gt_action = df.iloc[np.where(targets[0])[0]].values.tolist()
+         pred_action = sorted([x[0] for x in pred_action])
+         gt_action = sorted([x[0] for x in gt_action])
+         return pred_action, gt_action
+
+     @torch.no_grad()
+     def evaluate(self):
+         all_outputs, all_targets = self.forward()
+         preds, targets = all_outputs.numpy(), all_targets.numpy()
+         if self.dataset == 'charades_ego':
+             m_ap, _, m_aps = charades_map(preds, targets)
+             print('mAP = {:.3f}'.format(m_ap))
+         elif self.dataset == 'egtea':
+             cm = confusion_matrix(targets, preds.argmax(axis=1))
+             mean_class_acc, acc = get_mean_accuracy(cm)
+             print('Mean Acc. = {:.3f}, Top-1 Acc. = {:.3f}'.format(mean_class_acc, acc))
+         else:
+             raise NotImplementedError
+
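For concreteness, tracing predict(0) through the files above: the ground-truth side is the actions column of data/charades_ego/csv/0.csv mapped through the label map, and the prediction side is the top-5 (sel = 5) classes by video-text similarity, both sorted alphabetically.

# worked example: ground-truth labels for sample index 0 (video P9SOAEGO)
# actions c098, c099, c100, c101, c102, c127  ->  gt_action ==
# ['Holding a broom', 'Putting a broom somewhere', 'Taking a broom from somewhere',
#  'Throwing a broom somewhere', 'Tidying something on the floor', 'Tidying up with a broom']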
meta/charades_ego/label_map.json ADDED
@@ -0,0 +1 @@
+ {"labels": ["Holding some clothes", "Putting clothes somewhere", "Taking some clothes from somewhere", "Throwing clothes somewhere", "Tidying some clothes", "Washing some clothes", "Closing a door", "Fixing a door", "Opening a door", "Putting something on a table", "Sitting on a table", "Sitting at a table", "Tidying up a table", "Washing a table", "Working at a table", "Holding a phone/camera", "Playing with a phone/camera", "Putting a phone/camera somewhere", "Taking a phone/camera from somewhere", "Talking on a phone/camera", "Holding a bag", "Opening a bag", "Putting a bag somewhere", "Taking a bag from somewhere", "Throwing a bag somewhere", "Closing a book", "Holding a book", "Opening a book", "Putting a book somewhere", "Smiling at a book", "Taking a book from somewhere", "Throwing a book somewhere", "Watching/Reading/Looking at a book", "Holding a towel/s", "Putting a towel/s somewhere", "Taking a towel/s from somewhere", "Throwing a towel/s somewhere", "Tidying up a towel/s", "Washing something with a towel", "Closing a box", "Holding a box", "Opening a box", "Putting a box somewhere", "Taking a box from somewhere", "Taking something from a box", "Throwing a box somewhere", "Closing a laptop", "Holding a laptop", "Opening a laptop", "Putting a laptop somewhere", "Taking a laptop from somewhere", "Watching a laptop or something on a laptop", "Working/Playing on a laptop", "Holding a shoe/shoes", "Putting shoes somewhere", "Putting on shoe/shoes", "Taking shoes from somewhere", "Taking off some shoes", "Throwing shoes somewhere", "Sitting in a chair", "Standing on a chair", "Holding some food", "Putting some food somewhere", "Taking food from somewhere", "Throwing food somewhere", "Eating a sandwich", "Making a sandwich", "Holding a sandwich", "Putting a sandwich somewhere", "Taking a sandwich from somewhere", "Holding a blanket", "Putting a blanket somewhere", "Snuggling with a blanket", "Taking a blanket from somewhere", "Throwing a blanket somewhere", "Tidying up a blanket/s", "Holding a pillow", "Putting a pillow somewhere", "Snuggling with a pillow", "Taking a pillow from somewhere", "Throwing a pillow somewhere", "Putting something on a shelf", "Tidying a shelf or something on a shelf", "Reaching for and grabbing a picture", "Holding a picture", "Laughing at a picture", "Putting a picture somewhere", "Taking a picture of something", "Watching/looking at a picture", "Closing a window", "Opening a window", "Washing a window", "Watching/Looking outside of a window", "Holding a mirror", "Smiling in a mirror", "Washing a mirror", "Watching something/someone/themselves in a mirror", "Walking through a doorway", "Holding a broom", "Putting a broom somewhere", "Taking a broom from somewhere", "Throwing a broom somewhere", "Tidying up with a broom", "Fixing a light", "Turning on a light", "Turning off a light", "Drinking from a cup/glass/bottle", "Holding a cup/glass/bottle of something", "Pouring something into a cup/glass/bottle", "Putting a cup/glass/bottle somewhere", "Taking a cup/glass/bottle from somewhere", "Washing a cup/glass/bottle", "Closing a closet/cabinet", "Opening a closet/cabinet", "Tidying up a closet/cabinet", "Someone is holding a paper/notebook", "Putting their paper/notebook somewhere", "Taking paper/notebook from somewhere", "Holding a dish", "Putting a dish/es somewhere", "Taking a dish/es from somewhere", "Wash a dish/dishes", "Lying on a sofa/couch", "Sitting on sofa/couch", "Lying on the floor", "Sitting on the floor", "Throwing something on the floor", 
"Tidying something on the floor", "Holding some medicine", "Taking/consuming some medicine", "Putting groceries somewhere", "Laughing at television", "Watching television", "Someone is awakening in bed", "Lying on a bed", "Sitting in a bed", "Fixing a vacuum", "Holding a vacuum", "Taking a vacuum from somewhere", "Washing their hands", "Fixing a doorknob", "Grasping onto a doorknob", "Closing a refrigerator", "Opening a refrigerator", "Fixing their hair", "Working on paper/notebook", "Someone is awakening somewhere", "Someone is cooking something", "Someone is dressing", "Someone is laughing", "Someone is running somewhere", "Someone is going from standing to sitting", "Someone is smiling", "Someone is sneezing", "Someone is standing up from somewhere", "Someone is undressing", "Someone is eating something"], "mapping_vn2act": {"c000": 0, "c001": 1, "c002": 2, "c003": 3, "c004": 4, "c005": 5, "c006": 6, "c007": 7, "c008": 8, "c009": 9, "c010": 10, "c011": 11, "c012": 12, "c013": 13, "c014": 14, "c015": 15, "c016": 16, "c017": 17, "c018": 18, "c019": 19, "c020": 20, "c021": 21, "c022": 22, "c023": 23, "c024": 24, "c025": 25, "c026": 26, "c027": 27, "c028": 28, "c029": 29, "c030": 30, "c031": 31, "c032": 32, "c033": 33, "c034": 34, "c035": 35, "c036": 36, "c037": 37, "c038": 38, "c039": 39, "c040": 40, "c041": 41, "c042": 42, "c043": 43, "c044": 44, "c045": 45, "c046": 46, "c047": 47, "c048": 48, "c049": 49, "c050": 50, "c051": 51, "c052": 52, "c053": 53, "c054": 54, "c055": 55, "c056": 56, "c057": 57, "c058": 58, "c059": 59, "c060": 60, "c061": 61, "c062": 62, "c063": 63, "c064": 64, "c065": 65, "c066": 66, "c067": 67, "c068": 68, "c069": 69, "c070": 70, "c071": 71, "c072": 72, "c073": 73, "c074": 74, "c075": 75, "c076": 76, "c077": 77, "c078": 78, "c079": 79, "c080": 80, "c081": 81, "c082": 82, "c083": 83, "c084": 84, "c085": 85, "c086": 86, "c087": 87, "c088": 88, "c089": 89, "c090": 90, "c091": 91, "c092": 92, "c093": 93, "c094": 94, "c095": 95, "c096": 96, "c097": 97, "c098": 98, "c099": 99, "c100": 100, "c101": 101, "c102": 102, "c103": 103, "c104": 104, "c105": 105, "c106": 106, "c107": 107, "c108": 108, "c109": 109, "c110": 110, "c111": 111, "c112": 112, "c113": 113, "c114": 114, "c115": 115, "c116": 116, "c117": 117, "c118": 118, "c119": 119, "c120": 120, "c121": 121, "c122": 122, "c123": 123, "c124": 124, "c125": 125, "c126": 126, "c127": 127, "c128": 128, "c129": 129, "c130": 130, "c131": 131, "c132": 132, "c133": 133, "c134": 134, "c135": 135, "c136": 136, "c137": 137, "c138": 138, "c139": 139, "c140": 140, "c141": 141, "c142": 142, "c143": 143, "c144": 144, "c145": 145, "c146": 146, "c147": 147, "c148": 148, "c149": 149, "c150": 150, "c151": 151, "c152": 152, "c153": 153, "c154": 154, "c155": 155, "c156": 156}}
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ gradio
+ torch
+ torchvision
+ scikit-learn
+ eva-decord
+ timm
+ einops
+ ftfy
+ regex
+ transformers
+ omegaconf
+ zCurve
+ numpy-hilbert-curve
svitt/config.py ADDED
@@ -0,0 +1,37 @@
+
+ import yaml
+ from omegaconf import OmegaConf, DictConfig
+
+ def load_base_cfg():
+     with open('configs/base.yml', 'r') as fp:
+         cfg = yaml.load(fp, Loader=yaml.SafeLoader)
+     return cfg
+
+ def load_cfg(cfg_file):
+     cfg = load_base_cfg()
+     with open(cfg_file, 'r') as fp:
+         exp_cfg = yaml.load(fp, Loader=yaml.SafeLoader)
+
+     cfg['model'].update(exp_cfg.get('model', {}))
+     cfg['data'].update(exp_cfg.get('data', {}))
+     dataset = cfg['data'].get('dataset')
+     return cfg
+
+ def convert_types(config):
+     """Convert `'None'` (str) --> `None` (None). Only supports top-level"""
+     for k, v in config.items():
+         if isinstance(v, DictConfig):
+             setattr(config, k, convert_types(v))
+
+         # TODO convert types in ListConfig, right now they are ignored
+         # if isinstance(v, ListConfig):
+         #     new_v = ListConfig()
+
+         if v in ["None", "none"]:
+             setattr(config, k, None)
+     return config
+
+ def setup_config(config_path):
+     yaml_config = OmegaConf.load(config_path)
+     config = convert_types(yaml_config)
+     return config
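A small sketch of what convert_types does when setup_config loads a YAML file: top-level string "None"/"none" values become Python None, other values are left untouched (values below chosen only for illustration):

# sketch: convert_types behaviour on an OmegaConf config
from omegaconf import OmegaConf
from svitt.config import convert_types

cfg = OmegaConf.create({"sparse_random_attn": "none", "attn_block_size": 56})
cfg = convert_types(cfg)
assert cfg.sparse_random_attn is None and cfg.attn_block_size == 56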
svitt/datasets.py ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import csv
8
+ import glob
9
+ import json
10
+ import numpy as np
11
+ import os.path as osp
12
+ import pickle
13
+ import random
14
+
15
+ import decord
16
+ import pandas as pd
17
+ import torch
18
+
19
+
20
+ def datetime2sec(str):
21
+ hh, mm, ss = str.split(':')
22
+ return int(hh) * 3600 + int(mm) * 60 + float(ss)
23
+
24
+
25
+ def video_loader(root, vid, second, end_second=None, chunk_len=300, fps=30, clip_length=32, jitter=False):
26
+ if chunk_len == -1:
27
+ vr = decord.VideoReader(osp.join(root, '{}.mp4'.format(vid)))
28
+ second_offset = second
29
+ if end_second is not None:
30
+ end_second = min(end_second, len(vr) / vr.get_avg_fps())
31
+ else:
32
+ end_second = len(vr) / vr.get_avg_fps()
33
+ else:
34
+ chunk_start = int(second) // chunk_len * chunk_len
35
+ second_offset = second - chunk_start
36
+ vr = decord.VideoReader(osp.join(root, '{}.mp4'.format(vid), '{}.mp4'.format(chunk_start)))
37
+ if fps == -1:
38
+ fps = vr.get_avg_fps()
39
+
40
+ # calculate frame_ids
41
+ frame_offset = int(np.round(second_offset * fps))
42
+ total_duration = max(int((end_second - second) * fps), clip_length)
43
+ if chunk_len == -1:
44
+ if end_second <= second:
45
+ raise ValueError("end_second should be greater than second")
46
+ else:
47
+ frame_ids = get_frame_ids(frame_offset, min(frame_offset + total_duration, len(vr)), num_segments=clip_length, jitter=jitter)
48
+ else:
49
+ frame_ids = get_frame_ids(frame_offset, frame_offset + total_duration, num_segments=clip_length, jitter=jitter)
50
+
51
+ # load frames
52
+ if max(frame_ids) < len(vr):
53
+ try:
54
+ frames = vr.get_batch(frame_ids).asnumpy()
55
+ except decord.DECORDError as error:
56
+ print(error)
57
+ frames = vr.get_batch([0] * len(frame_ids)).asnumpy()
58
+ else:
59
+ # find the remaining frames in the next chunk
60
+ try:
61
+ frame_ids_part1 = list(filter(lambda frame_id: frame_id < len(vr), frame_ids))
62
+ frames_part1 = vr.get_batch(frame_ids_part1).asnumpy()
63
+ vr2 = decord.VideoReader(osp.join(root, '{}.mp4'.format(vid), '{}.mp4'.format(chunk_start + chunk_len)))
64
+ frame_ids_part2 = list(filter(lambda frame_id: frame_id >= len(vr), frame_ids))
65
+ frame_ids_part2 = [min(frame_id % len(vr), len(vr2) - 1) for frame_id in frame_ids_part2]
66
+ frames_part2 = vr2.get_batch(frame_ids_part2).asnumpy()
67
+ frames = np.concatenate([frames_part1, frames_part2], axis=0)
68
+ # the next chunk does not exist; the current chunk is the last one
69
+ except (RuntimeError, decord.DECORDError) as error:
70
+ print(error)
71
+ frame_ids = get_frame_ids(min(frame_offset, len(vr) - 1), len(vr), num_segments=clip_length, jitter=jitter)
72
+ frames = vr.get_batch(frame_ids).asnumpy()
73
+
74
+ frames = [torch.tensor(frame, dtype=torch.float32) for frame in frames]
75
+ return torch.stack(frames, dim=0)
76
+
77
+
78
+ def get_frame_ids(start_frame, end_frame, num_segments=32, jitter=True):
79
+ seg_size = float(end_frame - start_frame - 1) / num_segments
80
+ seq = []
81
+ for i in range(num_segments):
82
+ start = int(np.round(seg_size * i) + start_frame)
83
+ end = int(np.round(seg_size * (i + 1)) + start_frame)
84
+ end = min(end, end_frame)
85
+ if jitter:
86
+ frame_id = np.random.randint(low=start, high=(end + 1))
87
+ else:
88
+ frame_id = (start + end) // 2
89
+ seq.append(frame_id)
90
+ return seq
91
+
92
+
93
+ def video_loader_by_frames(root, vid, frame_ids):
94
+ vr = decord.VideoReader(osp.join(root, vid))
95
+ try:
96
+ frames = vr.get_batch(frame_ids).asnumpy()
97
+ frames = [torch.tensor(frame, dtype=torch.float32) for frame in frames]
98
+ except (IndexError, decord.DECORDError) as error:
99
+ print(error)
100
+ print("Erroneous video: ", vid)
101
+ frames = [torch.zeros((240, 320, 3)) for _ in range(len(frame_ids))]
102
+ return torch.stack(frames, dim=0)
103
+
104
+
105
+ class VideoCaptionDatasetBase(torch.utils.data.Dataset):
106
+ def __init__(self, dataset, root, metadata, is_trimmed=True):
107
+ self.dataset = dataset
108
+ self.root = root
109
+ self.is_trimmed = is_trimmed
110
+
111
+ if self.dataset == 'ego4d':
112
+ with open(metadata, 'rb') as f:
113
+ self.samples = pickle.load(f)
114
+ elif self.dataset == 'ego4d_mcq':
115
+ with open(metadata, 'r') as f:
116
+ self.samples = json.load(f)
117
+ elif self.dataset in ['ek100_cls', 'ek100_mir']:
118
+ video_list = glob.glob(osp.join(self.root, '*/*.MP4'))
119
+ fps_dict = {video: decord.VideoReader(video).get_avg_fps() for video in video_list}
120
+ self.samples = []
121
+ with open(metadata) as f:
122
+ csv_reader = csv.reader(f)
123
+ _ = next(csv_reader) # skip the header
124
+ for row in csv_reader:
125
+ pid, vid = row[1:3]
126
+ # start_frame, end_frame = int(row[6]), int(row[7])
127
+ # Deprecated: some videos might have fps mismatch issue
128
+ start_timestamp, end_timestamp = datetime2sec(row[4]), datetime2sec(row[5])
129
+ narration = row[8]
130
+ verb, noun = int(row[10]), int(row[12])
131
+ vid_path = '{}/{}.MP4'.format(pid, vid)
132
+ fps = fps_dict[osp.join(self.root, vid_path)]
133
+ start_frame = int(np.round(fps * start_timestamp))
134
+ end_frame = int(np.ceil(fps * end_timestamp))
135
+ self.samples.append((vid_path, start_frame, end_frame, narration, verb, noun))
136
+ if self.dataset == 'ek100_mir':
137
+ self.metadata_sentence = pd.read_csv(metadata[:metadata.index('.csv')] + '_sentence.csv')
138
+ if 'train' in metadata:
139
+ self.relevancy_mat = pickle.load(open(osp.join(osp.dirname(metadata), 'relevancy', 'caption_relevancy_EPIC_100_retrieval_train.pkl'), 'rb'))
140
+ elif 'test' in metadata:
141
+ self.relevancy_mat = pickle.load(open(osp.join(osp.dirname(metadata), 'relevancy', 'caption_relevancy_EPIC_100_retrieval_test.pkl'), 'rb'))
142
+ else:
143
+ raise ValueError('{} should contain either "train" or "test"!'.format(metadata))
144
+ self.relevancy = .1
145
+ elif self.dataset == 'egtea':
146
+ video_list = glob.glob(osp.join(self.root, '*/*'))
147
+ len_dict = {video: len(decord.VideoReader(video)) for video in video_list}
148
+
149
+ vn_list, labels = [], []
150
+ for row in open(osp.join(osp.dirname(metadata), 'action_idx.txt')):
151
+ row = row.strip()
152
+ vn = int(row.split(' ')[-1])
153
+ vn_list.append(vn)
154
+ narration = ' '.join(row.split(' ')[:-1])
155
+ labels.append(narration.replace('_', ' ').lower())
156
+ # labels.append(narration)
157
+ mapping_act2narration = {vn: narration for vn, narration in zip(vn_list, labels)}
158
+
159
+ self.samples = []
160
+ with open(metadata) as f:
161
+ for row in f:
162
+ clip_id, action_idx = row.strip().split(' ')[:2]
163
+ video_id = '-'.join(clip_id.split('-')[:3])
164
+ vid_relpath = osp.join(video_id, '{}.mp4'.format(clip_id))
165
+ vid_fullpath = osp.join(self.root, video_id, '{}.mp4'.format(clip_id))
166
+ self.samples.append((vid_relpath, 0, len_dict[vid_fullpath], mapping_act2narration[int(action_idx)]))
167
+ elif self.dataset == 'charades_ego':
168
+ video_list = glob.glob(osp.join(self.root, '*.mp4'))
169
+ fps_dict = {video: decord.VideoReader(video).get_avg_fps() for video in video_list}
170
+ self.samples = []
171
+ with open(metadata) as f:
172
+ csv_reader = csv.reader(f)
173
+ _ = next(csv_reader) # skip the header
174
+ for row in csv_reader:
175
+ video_id = row[0]
176
+ if self.is_trimmed:
177
+ for action_tuple in row[9].split(';'):
178
+ if not action_tuple:
179
+ continue
180
+ action, start_timestamp, end_timestamp = action_tuple.split(' ')
181
+ start_timestamp, end_timestamp = float(start_timestamp), float(end_timestamp)
182
+ vid_path = '{}.mp4'.format(video_id)
183
+ fps = fps_dict[osp.join(self.root, vid_path)]
184
+ start_frame = int(np.round(fps * start_timestamp))
185
+ end_frame = int(np.ceil(fps * end_timestamp))
186
+ self.samples.append((vid_path, start_frame, end_frame, action))
187
+ else:
188
+ if not row[9]:
189
+ action_list = []
190
+ else:
191
+ action_list = [action_tuple.split(' ')[0] for action_tuple in row[9].split(';')]
192
+ vid_path = '{}.mp4'.format(video_id)
193
+ fps = fps_dict[osp.join(self.root, vid_path)]
194
+ duration = fps * float(row[10])
195
+ self.samples.append((vid_path, 0, duration, action_list))
196
+ elif self.dataset == 'charades_ego_trimmed':
197
+ with open(metadata, 'rb') as f:
198
+ self.samples = pickle.load(f)
199
+ else:
200
+ raise NotImplementedError
201
+
202
+ def get_raw_item(self, i, is_training=True, num_clips=1, clip_length=32, clip_stride=2, sparse_sample=False,
203
+ narration_selection='random'):
204
+ if self.dataset == 'ego4d':
205
+ if len(self.samples[i]) == 4:
206
+ vid, start_second, end_second, narration = self.samples[i]
207
+ frames = video_loader(self.root, vid, start_second,
208
+ end_second=end_second,
209
+ clip_length=clip_length,
210
+ jitter=is_training)
211
+ if isinstance(narration, list):
212
+ if narration_selection == 'random':
213
+ narration = random.choice(narration)
214
+ elif narration_selection == 'concat':
215
+ narration = '. '.join(narration)
216
+ elif narration_selection == 'list':
217
+ narration = narration
218
+ else:
219
+ raise ValueError
220
+ return frames, narration
221
+ elif len(self.samples[i]) == 5:
222
+ # TODO: need better filtering strategy based on nll
223
+ vid, start_second, end_second, narration, _ = self.samples[i]
224
+ frames = video_loader(self.root, vid, start_second,
225
+ end_second=end_second,
226
+ clip_length=clip_length,
227
+ jitter=is_training)
228
+ if isinstance(narration, list):
229
+ if narration_selection == 'random':
230
+ narration = random.choice(narration)
231
+ elif narration_selection == 'concat':
232
+ narration = '. '.join(narration)
233
+ elif narration_selection == 'list':
234
+ narration = narration
235
+ else:
236
+ raise ValueError
237
+ return frames, narration
238
+ elif self.dataset == 'ego4d_mcq':
239
+ itemMCQ = self.samples[str(i)]
240
+ answerIndex = itemMCQ['answer']
241
+ textQuery = itemMCQ['query']['clip_text']
242
+ sampleOptions = itemMCQ['choices']
243
+ frames_options = []
244
+ narration_options = []
245
+ for option_id in range(len(sampleOptions)):
246
+ option = sampleOptions[str(option_id)]
247
+ frames = video_loader(self.root, option['video_uid'],
248
+ float(option['clip_start']), end_second=float(option['clip_end']),
249
+ clip_length=clip_length,
250
+ jitter=is_training)
251
+ frames_options.append(frames)
252
+ narration_options.append(option['clip_text'])
253
+ return textQuery, frames_options, narration_options, answerIndex, itemMCQ['types']
254
+ elif self.dataset == 'ek100_mir':
255
+ vid_path, start_frame, end_frame, narration, verb, noun = self.samples[i]
256
+ # from third_party.EgoVLP.base.base_dataset import sample_frames_start_end
257
+ # frame_ids = sample_frames_start_end(clip_length, start_frame, end_frame, sample='uniform', fix_start=None)
258
+ frame_ids = get_frame_ids(start_frame, end_frame, num_segments=clip_length, jitter=is_training)
259
+ frames = video_loader_by_frames(self.root, vid_path, frame_ids)
260
+ if is_training:
261
+ positive_list = np.where(self.relevancy_mat[i] > self.relevancy)[0].tolist()
262
+ if positive_list != []:
263
+ pos = random.sample(positive_list, min(len(positive_list), 1))[0]
264
+ if pos < len(self.metadata_sentence) and pos < self.relevancy_mat.shape[1]:
265
+ return frames, (self.metadata_sentence.iloc[pos][1], self.relevancy_mat[i][pos])
266
+ else:
267
+ return frames, (narration, 1)
268
+ elif self.dataset == 'ek100_cls':
269
+ vid_path, start_frame, end_frame, narration, verb, noun = self.samples[i]
270
+ frame_ids = get_frame_ids(start_frame, end_frame, num_segments=clip_length, jitter=is_training)
271
+ frames = video_loader_by_frames(self.root, vid_path, frame_ids)
272
+ return frames, '{}:{}'.format(verb, noun)
273
+ elif self.dataset == 'egtea':
274
+ vid_path, start_frame, end_frame, sentence = self.samples[i]
275
+ if is_training:
276
+ assert num_clips == 1
277
+ if end_frame < clip_length * clip_stride:
278
+ frames = video_loader_by_frames(self.root, vid_path, list(np.arange(0, end_frame)))
279
+ zeros = torch.zeros((clip_length * clip_stride - end_frame, *frames.shape[1:]))
280
+ frames = torch.cat((frames, zeros), dim=0)
281
+ frames = frames[::clip_stride]
282
+ else:
283
+ start_id = np.random.randint(0, end_frame - clip_length * clip_stride + 1)
284
+ frame_ids = np.arange(start_id, start_id + clip_length * clip_stride, clip_stride)
285
+ frames = video_loader_by_frames(self.root, vid_path, frame_ids)
286
+ else:
287
+ if end_frame < clip_length * clip_stride:
288
+ frames = video_loader_by_frames(self.root, vid_path, list(np.arange(0, end_frame)))
289
+ zeros = torch.zeros((clip_length * clip_stride - end_frame, *frames.shape[1:]))
290
+ frames = torch.cat((frames, zeros), dim=0)
291
+ frames = frames[::clip_stride]
292
+ frames = frames.repeat(num_clips, 1, 1, 1)
293
+ else:
294
+ frame_ids = []
295
+ for start_id in np.linspace(0, end_frame - clip_length * clip_stride, num_clips, dtype=int):
296
+ frame_ids.extend(np.arange(start_id, start_id + clip_length * clip_stride, clip_stride))
297
+ frames = video_loader_by_frames(self.root, vid_path, frame_ids)
298
+ return frames, sentence
299
+ elif self.dataset == 'charades_ego':
300
+ vid_path, start_frame, end_frame, action_list = self.samples[i]
301
+ if sparse_sample:
302
+ frame_ids = get_frame_ids(start_frame, end_frame, num_segments=num_clips * clip_length, jitter=is_training)
303
+ frames = video_loader_by_frames(self.root, vid_path, frame_ids)
304
+ else:
305
+ if end_frame < clip_length * clip_stride:
306
+ frames = video_loader_by_frames(self.root, vid_path, list(np.arange(0, end_frame)))
307
+ zeros = torch.zeros((clip_length * clip_stride - end_frame, *frames.shape[1:]))
308
+ frames = torch.cat((frames, zeros), dim=0)
309
+ frames = frames[::clip_stride]
310
+ frames = frames.repeat(num_clips, 1, 1, 1)
311
+ else:
312
+ frame_ids = []
313
+ for start_id in np.linspace(0, end_frame - clip_length * clip_stride, num_clips, dtype=int):
314
+ frame_ids.extend(np.arange(start_id, start_id + clip_length * clip_stride, clip_stride))
315
+ #print('frame_ids:', frame_ids)
316
+ frames = video_loader_by_frames(self.root, vid_path, frame_ids)
317
+ return frames, action_list, vid_path
318
+ elif self.dataset == 'charades_ego_trimmed':
319
+ vid, start_second, end_second, narration = self.samples[i]
320
+ frames = video_loader(self.root, vid, start_second,
321
+ end_second=end_second,
322
+ chunk_len=-1, # no chunk for CharadesEgo
323
+ fps=-1, # could be variable fps
324
+ clip_length=clip_length,
325
+ jitter=is_training)
326
+ return frames, narration
327
+ else:
328
+ raise NotImplementedError
329
+
330
+ def __getitem__(self, i):
331
+ raise NotImplementedError
332
+
333
+ def __len__(self):
334
+ return len(self.samples)
335
+
336
+
337
+ class VideoCaptionDatasetCLIP(VideoCaptionDatasetBase):
338
+ def __init__(self, dataset, root, metadata, transform=None,
339
+ is_training=True, tokenizer=None,
340
+ clip_length=32, clip_stride=2, sparse_sample=False,
341
+ narration_selection='random',
342
+ num_hard_negatives=0,
343
+ subsample_stride=None):
344
+ super().__init__(dataset, root, metadata)
345
+
346
+ self.full_samples = self.samples.copy()
347
+ if isinstance(subsample_stride, int):
348
+ self.samples = self.samples[::subsample_stride]
349
+ self.transform = transform
350
+ self.is_training = is_training
351
+ self.tokenizer = tokenizer
352
+ self.clip_length = clip_length
353
+ self.clip_stride = clip_stride
354
+ self.sparse_sample = sparse_sample
355
+ self.narration_selection = narration_selection
356
+ self.num_hard_negatives = num_hard_negatives
357
+ if num_hard_negatives > 0:
358
+ assert self.dataset == 'htm_aa'
359
+
360
+ def __getitem__(self, i):
361
+ frames, caption = self.get_raw_item(
362
+ i, is_training=self.is_training,
363
+ clip_length=self.clip_length,
364
+ clip_stride=self.clip_stride,
365
+ sparse_sample=self.sparse_sample,
366
+ narration_selection=self.narration_selection,
367
+ )
368
+
369
+ # ek100_mir will also output relevancy value
370
+ if isinstance(caption, tuple):
371
+ caption, relevancy = caption
372
+ else:
373
+ relevancy = 0.
374
+
375
+ # apply transformation
376
+ if self.transform is not None:
377
+ frames = self.transform(frames)
378
+
379
+ # tokenize caption
380
+ if self.tokenizer is not None:
381
+ caption = self.tokenizer(caption)
382
+
383
+ if isinstance(caption, tuple):
384
+ caption, mask = caption
385
+ return frames, caption, mask, relevancy
386
+ else:
387
+ return frames, caption, relevancy
388
+
389
+
390
+ class VideoCaptionDatasetMCQ(VideoCaptionDatasetBase):
391
+ def __init__(self, dataset, root, metadata, transform=None,
392
+ is_training=True, tokenizer=None,
393
+ clip_length=32, clip_stride=2, sparse_sample=False,
394
+ narration_selection='random'):
395
+ super().__init__(dataset, root, metadata)
396
+
397
+ self.full_samples = self.samples.copy()
398
+ self.transform = transform
399
+ self.is_training = is_training
400
+ self.tokenizer = tokenizer
401
+ self.clip_length = clip_length
402
+ self.clip_stride = clip_stride
403
+ self.sparse_sample = sparse_sample
404
+         self.narration_selection = narration_selection
+
+     def __getitem__(self, i):
+
+         textQuery, frames_options, narration_options, answerIndex, q_type = self.get_raw_item(
+             i, is_training=self.is_training,
+             clip_length=self.clip_length,
+             clip_stride=self.clip_stride,
+             sparse_sample=self.sparse_sample,
+             narration_selection=self.narration_selection,
+         )
+
+         # apply transformation
+         if self.transform is not None:
+             frames_options = [self.transform(frames) for frames in frames_options]
+
+         # tokenize caption
+         if self.tokenizer is not None:
+             textQuery = self.tokenizer(textQuery)
+             narration_options = self.tokenizer(narration_options)
+             if isinstance(textQuery, tuple):
+                 textQuery, mask_query = textQuery
+                 narration_options, mask_options = narration_options
+                 return (
+                     textQuery, torch.stack(frames_options, dim=0),
+                     narration_options, answerIndex, q_type,
+                     mask_query, mask_options
+                 )
+             else:
+                 return textQuery, torch.stack(frames_options, dim=0), narration_options, answerIndex, q_type
+
+
+ class VideoClassyDataset(VideoCaptionDatasetBase):
+     def __init__(
+         self, dataset, root, metadata, transform=None,
+         is_training=True, label_mapping=None,
+         num_clips=1,
+         clip_length=32, clip_stride=2,
+         sparse_sample=False,
+         is_trimmed=True,
+     ):
+         super().__init__(dataset, root, metadata, is_trimmed=is_trimmed)
+
+         self.transform = transform
+         self.is_training = is_training
+         self.label_mapping = label_mapping
+         self.num_clips = num_clips
+         self.clip_length = clip_length
+         self.clip_stride = clip_stride
+         self.sparse_sample = sparse_sample
+
+     def __getitem__(self, i):
+         frames, label, vid_path = self.get_raw_item(
+             i, is_training=self.is_training,
+             num_clips=self.num_clips,
+             clip_length=self.clip_length,
+             clip_stride=self.clip_stride,
+             sparse_sample=self.sparse_sample,
+         )
+
+         # apply transformation
+         if self.transform is not None:
+             frames = self.transform(frames)
+
+         if self.label_mapping is not None:
+             if isinstance(label, list):
+                 # multi-label case
+                 res_array = np.zeros(len(self.label_mapping))
+                 for lbl in label:
+                     res_array[self.label_mapping[lbl]] = 1.
+                 label = res_array
+             else:
+                 label = self.label_mapping[label]
+
+         return frames, label, vid_path
+
+
+ def get_dataset(train_transform, tokenizer, cfg, is_training=True):
+     narration_selection = cfg.get('narration_selection', 'random')
+     num_hard_neg = cfg.get('num_hard_neg', 0)
+     data_cfg = cfg['data']
+     if cfg['model']['arch'].startswith('CLIP') or cfg['model']['arch'].startswith('VCLM'):
+         if is_training:
+             metadata = data_cfg['metadata']
+         else:
+             metadata = data_cfg['metadata_val']
+
+         return VideoCaptionDatasetCLIP(
+             data_cfg['dataset'], data_cfg['root'], metadata, train_transform,
+             is_training=is_training,
+             tokenizer=tokenizer,
+             clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride'],
+             sparse_sample=data_cfg['sparse_sample'],
+             narration_selection=narration_selection,
+             num_hard_negatives=num_hard_neg
+         )
+     else:
+         raise NotImplementedError
+
+
+ def get_downstream_dataset(transform, tokenizer, cfg, is_training=True, num_clips=0, label_mapping=None):
+     data_cfg = cfg['data']
+     n_clips = num_clips if num_clips > 0 else data_cfg['num_clips']
+     if is_training:
+         metadata = data_cfg['metadata']
+         return VideoClassyDataset(
+             data_cfg['dataset'], data_cfg['root'], metadata, transform,
+             is_training=True, label_mapping=label_mapping,
+             num_clips=n_clips,
+             clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride'],
+             sparse_sample=data_cfg['sparse_sample'],
+         )
+     else:
+         metadata = data_cfg['metadata_val']
+         return VideoClassyDataset(
+             data_cfg['dataset'], data_cfg['root'], metadata, transform,
+             is_training=False, label_mapping=label_mapping,
+             num_clips=n_clips,
+             clip_length=data_cfg['clip_length'], clip_stride=data_cfg['clip_stride'],
+             sparse_sample=data_cfg['sparse_sample'],
+             is_trimmed=not data_cfg['dataset'] == 'charades_ego'
+         )
+
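A minimal sketch of how the dataset helpers above might be driven for Charades-Ego evaluation is shown below. The config values, the metadata path, and the identity transform are illustrative assumptions (the real settings live in configs/charades_ego/svitt.yml), and `generate_label_map` comes from svitt/preprocess.py later in this commit.

```python
import torch

from svitt.preprocess import generate_label_map

# Hypothetical config dict; keys mirror the `data` section read above.
cfg = {
    'data': {
        'dataset': 'charades_ego',
        'root': 'data/charades_ego/video',
        'metadata_val': 'data/charades_ego/CharadesEgo_v1_test_only1st.csv',  # assumed path
        'num_clips': 1,
        'clip_length': 16,
        'clip_stride': 4,
        'sparse_sample': False,
    }
}

labels, mapping_vn2act = generate_label_map('charades_ego')
val_dataset = get_downstream_dataset(
    transform=lambda frames: frames,  # placeholder; use the model's eval transform here
    tokenizer=None,
    cfg=cfg,
    is_training=False,
    label_mapping=mapping_vn2act,
)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False)
frames, label, vid_path = next(iter(val_loader))  # one clip, its multi-hot label, the video path
```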
svitt/evaluation.py ADDED
@@ -0,0 +1,36 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import numpy as np
+ import torch
+
+
+ def accuracy(output, target, topk=(1,)):
+     """Computes the accuracy over the k top predictions for the specified values of k"""
+     with torch.no_grad():
+         maxk = max(topk)
+         batch_size = target.size(0)
+
+         _, pred = output.topk(maxk, 1, True, True)
+         pred = pred.t()
+         correct = pred.eq(target.reshape(1, -1).expand_as(pred))
+
+         res = []
+         for k in topk:
+             correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+             res.append(correct_k.mul_(100.0 / batch_size))
+         return res
+
+
+ def get_mean_accuracy(cm):
+     list_acc = []
+     for i in range(len(cm)):
+         acc = 0
+         if cm[i, :].sum() > 0:
+             acc = cm[i, i] / cm[i, :].sum()
+         list_acc.append(acc)
+
+     return 100 * np.mean(list_acc), 100 * np.trace(cm) / np.sum(cm)
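A quick toy usage of the two helpers above (hand-made tensors rather than real model outputs):

```python
import numpy as np
import torch

from svitt.evaluation import accuracy, get_mean_accuracy

# Logits for 4 samples over 3 classes, and their ground-truth labels.
logits = torch.tensor([[2.0, 1.0, 0.1],
                       [0.2, 3.0, 0.5],
                       [1.5, 0.3, 1.2],
                       [0.1, 0.2, 4.0]])
target = torch.tensor([0, 1, 2, 2])

top1, top2 = accuracy(logits, target, topk=(1, 2))
print(top1.item(), top2.item())  # 75.0 100.0 (sample 2 is only recovered at top-2)

# Mean per-class accuracy vs. overall accuracy from a confusion matrix.
cm = np.array([[5, 0, 0],
               [1, 3, 0],
               [0, 2, 2]])
mean_acc, overall_acc = get_mean_accuracy(cm)  # 75.0, ~76.9
```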
svitt/evaluation_charades.py ADDED
@@ -0,0 +1,56 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import numpy as np
+
+
+ def compute_map(submission_array, gt_array):
+     """ Returns mAP, weighted mAP, and AP array """
+     m_aps = []
+     n_classes = submission_array.shape[1]
+     for oc_i in range(n_classes):
+         sorted_idxs = np.argsort(-submission_array[:, oc_i])
+         tp = gt_array[:, oc_i][sorted_idxs] == 1
+         fp = np.invert(tp)
+         n_pos = tp.sum()
+         if n_pos < 0.1:
+             m_aps.append(float('nan'))
+             continue
+         fp.sum()
+         f_pcs = np.cumsum(fp)
+         t_pcs = np.cumsum(tp)
+         prec = t_pcs / (f_pcs + t_pcs).astype(float)
+         avg_prec = 0
+         for i in range(submission_array.shape[0]):
+             if tp[i]:
+                 avg_prec += prec[i]
+         m_aps.append(avg_prec / n_pos.astype(float))
+     m_aps = np.array(m_aps)
+     # m_ap = np.mean(m_aps)
+     m_ap = m_aps[~np.isnan(m_aps)]
+     print(f'num of available classes: {len(m_ap)}')
+     m_ap = m_ap.mean()  # mean over classes with at least one positive (NaNs excluded)
+     w_ap = (m_aps * gt_array.sum(axis=0) / gt_array.sum().sum().astype(float))
+     return m_ap, w_ap, m_aps
+
+
+ def charades_map(submission_array, gt_array):
+     """
+     Approximate version of the Charades evaluation function.
+     For precise numbers, use the submission file with the official MATLAB script.
+     """
+     fix = submission_array.copy()
+     empty = np.sum(gt_array, axis=1) == 0
+     fix[empty, :] = np.NINF
+     return compute_map(fix, gt_array)
+
+
+ def create_submission(video_list, predictions, out_file):
+     assert len(video_list) == predictions.shape[0]
+     with open(out_file, 'w') as f:
+         for i, video_id in enumerate(video_list):
+             pred_str = ' '.join(map(lambda x: str(x), predictions[i].tolist()))
+             f.write('{} {}\n\n'.format(video_id, pred_str))
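And a toy run of the Charades mAP helpers above; clips whose ground-truth row is all zeros are pushed to the bottom of every class ranking via `-inf`:

```python
import numpy as np

from svitt.evaluation_charades import charades_map, create_submission

# Scores and multi-hot ground truth for 4 clips over 3 action classes.
scores = np.array([[0.9, 0.2, 0.1],
                   [0.1, 0.8, 0.3],
                   [0.4, 0.1, 0.7],
                   [0.2, 0.2, 0.2]])
gt = np.array([[1, 0, 0],
               [0, 1, 0],
               [0, 0, 1],
               [0, 0, 0]])  # last clip has no labels

m_ap, w_ap, per_class_ap = charades_map(scores, gt)
create_submission(['clip0', 'clip1', 'clip2', 'clip3'], scores, '/tmp/charades_submission.txt')
```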
svitt/model.py ADDED
@@ -0,0 +1,340 @@
1
+ from svitt.utils import (
2
+ interpolate_pos_embed,
3
+ interpolate_pos_relative_bias_beit_3d,
4
+ )
5
+ from omegaconf import OmegaConf
6
+ from transformers import ViTModel, ViTConfig
7
+ from svitt.sparse_config import BertConfig, BeitConfig
8
+ from svitt.sparse_xbeit import BeitModel
9
+ from svitt.sparse_xbert import BertModel, BertForMaskedLM
10
+
11
+ import torch
12
+ from torch import nn
13
+ import torch.nn.functional as F
14
+
15
+
16
+ class SViTT(nn.Module):
17
+ """Common utils shared by pretraining and downstream retrieval"""
18
+ def __init__(self, config=None, tokenizer=None, pretrain=True, **kwargs):
19
+ super().__init__()
20
+ self.config = config
21
+ self.tokenizer = tokenizer
22
+ self.embed_dim = config.embed_dim
23
+ self.vision_width = 768
24
+ self.text_width = 768
25
+ self.pretrain = pretrain
26
+
27
+ self.vision_encoder, self.vision_layernorm = self.build_vision_encoder()
28
+ self.text_encoder = self.build_text_encoder()
29
+
30
+ self.vision_proj = nn.Linear(self.vision_width, self.embed_dim)
31
+ self.text_proj = nn.Linear(self.text_width, self.embed_dim)
32
+
33
+ self.temp = nn.Parameter(torch.ones([]) * config.temp)
34
+ self.itm_head = nn.Linear(self.text_width, 2)
35
+
36
+
37
+ def build_text_encoder(self):
38
+
39
+ bert_config = BertConfig.from_json_file(self.config.bert_config)
40
+
41
+ # Override params for sparse text encoder
42
+ model_args = getattr(self.config, 'text_encoder_args', {})
43
+ if model_args:
44
+ model_args = OmegaConf.to_object(model_args)
45
+ bert_config.update(model_args)
46
+
47
+ if self.pretrain:
48
+ text_encoder, _ = BertForMaskedLM.from_pretrained(
49
+ self.config.text_encoder, config=bert_config,
50
+ output_loading_info=True
51
+ )
52
+ else:
53
+ text_encoder, _ = BertModel.from_pretrained(
54
+ self.config.text_encoder, config=bert_config,
55
+ add_pooling_layer=False, output_loading_info=True
56
+ )
57
+ return text_encoder
58
+
59
+ def build_vision_encoder(self):
60
+ # if self.config.vit_type in ["beit", "deit", "vit", "vit32"]:
61
+ if self.config.vit_type in ["beit"]:
62
+ vision_encoder = self.build_huggingface_vit_with_image_size(
63
+ self.config.vit_name_or_pretrained_path, self.config.image_res,)
64
+ else:
65
+ raise ValueError(f"Unknown vit type {self.config.vit_type}")
66
+
67
+ # add layernorm for normalizing BEiT outputs hidden states
68
+ vision_layernorm = None
69
+ if self.config.vit_type == "beit":
70
+ vision_layernorm = nn.LayerNorm(self.vision_width, eps=1e-12)
71
+ return vision_encoder, vision_layernorm
72
+
73
+ # @classmethod
74
+ # def build_huggingface_vit_with_image_size(cls, model_card: str, image_size: int):
75
+ def build_huggingface_vit_with_image_size(self, model_card: str, image_size: int):
76
+ """Build a vit model from huggingface hub, also interpolate pos_embed when needed.
77
+
78
+ Args:
79
+ model_card: name in huggingface hub, e.g., `facebook/deit-base-patch16-224`
80
+ image_size: new image size, may be different from pre-training image_size of `model_card`
81
+
82
+ ref: https://github.com/huggingface/transformers/issues/12167#issuecomment-861356232
83
+ """
84
+ is_beit = "beit" in model_card
85
+ if "beit" in model_card:
86
+ model_cls, config_cls = BeitModel, BeitConfig
87
+ elif "deit" in model_card or "vit" in model_card:
88
+ # the deit model we use is loaded in vit arch,
89
+ # see https://huggingface.co/facebook/deit-base-patch16-224#how-to-use
90
+ model_cls, config_cls = ViTModel, ViTConfig
91
+ else:
92
+ raise ValueError(f"Unexpected model_card: {model_card}")
93
+
94
+ # BEiT uses average pooled tokens instead of [CLS] used by other models
95
+ tmp_model = model_cls.from_pretrained(model_card, add_pooling_layer=is_beit)
96
+ state_dict = tmp_model.state_dict()
97
+ del tmp_model
98
+
99
+ # Override params for sparse vision encoder
100
+ model_args = getattr(self.config, 'vision_encoder_args', {})
101
+ if model_args:
102
+ model_args = OmegaConf.to_object(model_args)
103
+ model_config = config_cls.from_pretrained(
104
+ model_card,
105
+ image_size=image_size,
106
+ **model_args,
107
+ )
108
+ model = model_cls(config=model_config, add_pooling_layer=is_beit, num_frames=self.config.video_input.num_frames)
109
+ if is_beit:
110
+ # interpolate relative pos bias
111
+ state_dict = interpolate_pos_relative_bias_beit_3d(
112
+ state_dict_old=state_dict,
113
+ state_dict_new=model.state_dict(),
114
+ patch_shape_new=model.window_size
115
+ )
116
+ else:
117
+ # interpolate pos_embed and load weights to new model
118
+ state_dict["embeddings.position_embeddings"] = interpolate_pos_embed(
119
+ pos_embed_old=state_dict["embeddings.position_embeddings"],
120
+ pos_embed_new=model.embeddings.position_embeddings,
121
+ num_patches_new=model.embeddings.patch_embeddings.num_patches
122
+ )
123
+ msg = model.load_state_dict(state_dict, strict=False)
124
+ return model
125
+
126
+ def get_text_encoder(self):
127
+ """get text encoder, used for text and cross-modal encoding"""
128
+ encoder = self.text_encoder
129
+ return encoder.bert if hasattr(encoder, "bert") else encoder
130
+
131
+ def encode_image(self, video, output_token_idx=False, output_attentions=False):
132
+ video_embeds = self.vision_encoder(video, output_token_idx=output_token_idx, output_attentions=output_attentions) # (bsz, seq_len, d)
133
+ if self.vision_layernorm is not None: # only for BEiT mean-pooling
134
+ video_embeds.last_hidden_state = self.vision_layernorm(video_embeds.last_hidden_state)
135
+ if output_token_idx:
136
+ token_idx = video_embeds.token_idx
137
+
138
+ if output_attentions:
139
+ attentions = video_embeds.attentions
140
+
141
+ if self.config.vit_type == "beit":
142
+ pooled_video_embeds = video_embeds.pooler_output # (bsz*num_frms, d)
143
+ video_embeds = video_embeds.last_hidden_state # (bsz*num_frms, L, d)
144
+ else:
145
+ video_embeds = video_embeds.last_hidden_state
146
+ pooled_video_embeds = video_embeds[:, 0]
147
+
148
+ outputs = (video_embeds, pooled_video_embeds)
149
+
150
+ if output_token_idx:
151
+ outputs += (token_idx,)
152
+
153
+ if output_attentions:
154
+ outputs += (attentions,)
155
+
156
+ return outputs
157
+
158
+ def _encode_image(self, image):
159
+ bsz, num_frms, c, h, w = image.shape # `num_frms` could be changing for image (=1) or video (e.g., =4)
160
+ image = image.view(bsz*num_frms, c, h, w)
161
+ image_embeds = self.vision_encoder(image)
162
+ if self.vision_layernorm is not None: # only for BEiT mean-pooling
163
+ image_embeds.last_hidden_state = self.vision_layernorm(image_embeds.last_hidden_state)
164
+
165
+ if self.config.vit_type == "beit":
166
+ pooled_image_embeds = image_embeds.pooler_output # (bsz*num_frms, d)
167
+ image_embeds = image_embeds.last_hidden_state # (bsz*num_frms, L, d)
168
+ else:
169
+ image_embeds = image_embeds.last_hidden_state
170
+ pooled_image_embeds = image_embeds[:, 0]
171
+
172
+ image_embeds = image_embeds.view(bsz, num_frms, -1, self.vision_width) # (bsz, num_frms, L, d)
173
+ pooled_image_embeds = pooled_image_embeds.view(bsz, num_frms, self.vision_width) \
174
+ if pooled_image_embeds is not None else None # (bsz, num_frms, d)
175
+ return image_embeds, pooled_image_embeds
176
+
177
+ def encode_text(self, text):
178
+ text_output = self.get_text_encoder()(
179
+ text.input_ids,
180
+ attention_mask=text.attention_mask,
181
+ return_dict=True,
182
+ mode='text'
183
+ )
184
+ text_embeds = text_output.last_hidden_state
185
+ pooled_text_embeds = text_embeds[:, 0]
186
+ return text_embeds, pooled_text_embeds
187
+
188
+ @torch.no_grad()
189
+ def clip_contrastive_temperature(self, min_val=0.001, max_val=0.5):
190
+ """Seems only used during pre-training"""
191
+ self.temp.clamp_(min_val, max_val)
192
+
193
+ @torch.no_grad()
194
+ def get_mask(self, sim, idx=None, normalize=False):
195
+ """
196
+ sim: (N, N)
197
+ idx: (N, )
198
+ normalize: bool, make row sum equal to 1
199
+ """
200
+ if idx is not None:
201
+ idx = idx.view(-1, 1)
202
+ mask = torch.eq(idx, idx.T).to(sim.dtype)
203
+ if normalize:
204
+ mask = mask / mask.sum(1, keepdim=True)
205
+ else:
206
+ mask = torch.zeros_like(sim)
207
+ mask.fill_diagonal_(1)
208
+ return mask # `1` mark valid/matched location
209
+
210
+ def get_contrastive_loss(self, pooled_image_embeds, pooled_text_embeds, idx=None):
211
+ sim_i2t, sim_t2i = self.get_sim(
212
+ pooled_image_embeds, pooled_text_embeds, t=self.temp)
213
+
214
+ with torch.no_grad():
215
+ sim_i2t_targets = self.get_mask(sim_i2t, idx=idx, normalize=True)
216
+ sim_t2i_targets = sim_i2t_targets
217
+
218
+ loss_i2t = -torch.sum(
219
+ F.log_softmax(sim_i2t, dim=1) * sim_i2t_targets, dim=1).mean()
220
+ loss_t2i = -torch.sum(
221
+ F.log_softmax(sim_t2i, dim=1) * sim_t2i_targets, dim=1).mean()
222
+
223
+ loss_ita = (loss_i2t + loss_t2i) / 2
224
+ return loss_ita, sim_i2t, sim_t2i
225
+
226
+ def get_sim(self, pooled_image_embeds, pooled_text_embeds, t=1):
227
+ """
228
+ Args:
229
+ pooled_image_embeds: (bsz, num_frms, d)
230
+ pooled_text_embeds: (bsz, d)
231
+ t: temperature
232
+ """
233
+ image_proj = self.vision_proj
234
+ text_proj = self.text_proj
235
+
236
+ image_feat = F.normalize(image_proj(pooled_image_embeds), dim=-1)
237
+ text_feat = F.normalize(text_proj(pooled_text_embeds), dim=-1)
238
+
239
+ if image_feat.ndim == 3:
240
+ sim_i2t = torch.einsum("mld,nd->mln", image_feat, text_feat).mean(1) / t # (N, N)
241
+ else:
242
+ sim_i2t = torch.einsum("md,nd ->mn", image_feat, text_feat) / t # (N, N)
243
+ sim_t2i = sim_i2t.T
244
+ return sim_i2t, sim_t2i
245
+
246
+ def get_itm_loss(self,
247
+ sim_i2t,
248
+ sim_t2i,
249
+ text_embeds,
250
+ text_atts,
251
+ image_embeds,
252
+ image_atts,
253
+ idx=None,
254
+ ):
255
+ """
256
+ sim_i2t, sim_t2i: (N, N)
257
+ text_embeds, text_atts, image_embeds, image_atts: (N, *)
258
+ idx: (N, )
259
+ """
260
+ bsz = len(sim_i2t)
261
+
262
+ with torch.no_grad():
263
+ weights_i2t = F.softmax(sim_i2t+1e-4, dim=1) # (N, N)
264
+ weights_t2i = F.softmax(sim_t2i+1e-4, dim=1)
265
+
266
+ mask = self.get_mask(sim_i2t, idx=idx).bool()
267
+ weights_i2t.masked_fill_(mask, 0)
268
+ weights_t2i.masked_fill_(mask, 0)
269
+
270
+ # select a negative image for each text
271
+ if self.config.itm_hard_neg:
272
+ img_neg_indices = torch.multinomial(weights_t2i, 1).squeeze() # can raise "invalid multinomial distribution" if a row of weights sums to 0
273
+ else:
274
+ img_neg_indices = self.get_rand_indices(mask, 1).squeeze()
275
+
276
+ image_embeds_neg = image_embeds[img_neg_indices]
277
+
278
+ # select a negative text for each image
279
+ if self.config.itm_hard_neg:
280
+ txt_neg_indices = torch.multinomial(weights_i2t, 1).squeeze()
281
+ else:
282
+ txt_neg_indices = self.get_rand_indices(mask, 1).squeeze()
283
+
284
+ text_embeds_neg = text_embeds[txt_neg_indices]
285
+ text_atts_neg = text_atts[txt_neg_indices] # (N, L, d)
286
+
287
+ # embedding on local gpu
288
+ _text_embeds = text_embeds
289
+ _text_atts = text_atts
290
+ _image_embeds = image_embeds
291
+ _image_atts = image_atts
292
+ # concat embeddings
293
+ text_embeds_all = torch.cat([_text_embeds, _text_embeds, text_embeds_neg], dim=0)
294
+ text_atts_all = torch.cat([_text_atts, _text_atts, text_atts_neg], dim=0)
295
+ image_embeds_all = torch.cat([_image_embeds, image_embeds_neg, _image_embeds], dim=0)
296
+ image_atts_all = torch.cat([_image_atts, _image_atts, _image_atts], dim=0)
297
+
298
+ text_encoder = self.get_text_encoder()
299
+ output = text_encoder(
300
+ encoder_embeds=text_embeds_all,
301
+ attention_mask=text_atts_all,
302
+ encoder_hidden_states=image_embeds_all,
303
+ encoder_attention_mask=image_atts_all,
304
+ return_dict=True,
305
+ mode='fusion',
306
+ )
307
+
308
+ itm_embeds = output.last_hidden_state[:, 0] # pos (N, d) + neg (2N, d)
309
+
310
+ loss_itm = self._get_itm_loss(itm_embeds, enc=self.itm_head)
311
+ itm_embeds_pos = itm_embeds[:bsz] # (N, d)
312
+
313
+ return loss_itm, itm_embeds_pos
314
+
315
+ def _get_itm_loss(self, itm_embeds, enc):
316
+ """
317
+ itm_embeds: (3*N, D)
318
+ enc: nn.Module that projects cls_embeds
319
+ """
320
+ itm_scores = enc(itm_embeds) # (3*N, 2)
321
+ bs = itm_scores.size(0) // 3
322
+ itm_labels = itm_scores.new_ones(3*bs, dtype=torch.long)
323
+ itm_labels[bs:] = 0
324
+ loss_itm = F.cross_entropy(itm_scores, itm_labels)
325
+ return loss_itm
326
+
327
+ def get_rand_indices(self, mask, k):
328
+ """
329
+ Args:
330
+ mask: (N, L) 0 indicates the positions that we can sample, 1 otherwise
331
+ k: #indices to sample at each row
332
+ Returns:
333
+ (N, k) indices
334
+ """
335
+ mask = mask.float()
336
+ mask = mask - 10000 * mask
337
+ mask += torch.randn_like(mask)
338
+ _, indices = torch.sort(mask, dim=1, descending=True)
339
+ indices = indices[:, :k].contiguous()
340
+ return indices
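Two details of the contrastive path above are easy to miss: `get_sim` averages per-frame similarities over the frame axis when the pooled video embedding is 3-D, and `get_mask` treats samples sharing the same `idx` as positives. A standalone sketch of those two steps on random tensors (not using the class itself):

```python
import torch
import torch.nn.functional as F

bsz, num_frms, dim, temp = 4, 8, 256, 0.07

# Stand-ins for the outputs of vision_proj / text_proj, L2-normalized as in get_sim.
image_feat = F.normalize(torch.randn(bsz, num_frms, dim), dim=-1)
text_feat = F.normalize(torch.randn(bsz, dim), dim=-1)

# Frame-level similarities averaged over the frame axis (the 3-D branch of get_sim).
sim_i2t = torch.einsum("mld,nd->mln", image_feat, text_feat).mean(1) / temp  # (bsz, bsz)
sim_t2i = sim_i2t.T

# Samples sharing a caption id count as positives (get_mask with normalize=True).
idx = torch.tensor([0, 1, 1, 2])  # samples 1 and 2 share a caption
mask = torch.eq(idx.view(-1, 1), idx.view(1, -1)).float()
targets = mask / mask.sum(1, keepdim=True)

loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1) * targets, dim=1).mean()
loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1) * targets, dim=1).mean()
loss_ita = (loss_i2t + loss_t2i) / 2
```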
svitt/preprocess.py ADDED
@@ -0,0 +1,86 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import csv
+
+ from lavila.models.tokenizer import MyBertTokenizer, MyDistilBertTokenizer, MyGPT2Tokenizer, SimpleTokenizer
+
+
+ def generate_label_map(dataset):
+     if dataset == 'ek100_cls':
+         print("=> preprocessing ek100 action label space")
+         vn_list = []
+         mapping_vn2narration = {}
+         for f in [
+             '/data/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv',
+             '/data/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv',
+         ]:
+             csv_reader = csv.reader(open(f))
+             _ = next(csv_reader)  # skip the header
+             for row in csv_reader:
+                 vn = '{}:{}'.format(int(row[10]), int(row[12]))
+                 narration = row[8]
+                 if vn not in vn_list:
+                     vn_list.append(vn)
+                 if vn not in mapping_vn2narration:
+                     mapping_vn2narration[vn] = [narration]
+                 else:
+                     mapping_vn2narration[vn].append(narration)
+                 # mapping_vn2narration[vn] = [narration]
+         vn_list = sorted(vn_list)
+         print('# of actions: {}'.format(len(vn_list)))
+         mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)}
+         labels = [list(set(mapping_vn2narration[vn_list[i]])) for i in range(len(mapping_vn2act))]
+         print(labels[:5])
+     elif dataset == 'charades_ego':
+         print("=> preprocessing charades_ego action label space")
+         vn_list = []
+         labels = []
+         with open('data/charades_ego/Charades_v1_classes.txt') as f:
+             csv_reader = csv.reader(f)
+             for row in csv_reader:
+                 vn = row[0][:4]
+                 vn_list.append(vn)
+                 narration = row[0][5:]
+                 labels.append(narration)
+         mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)}
+         print(labels[:5])
+     elif dataset == 'egtea':
+         print("=> preprocessing egtea action label space")
+         labels = []
+         with open('/data/EGTEA/action_idx.txt') as f:
+             for row in f:
+                 row = row.strip()
+                 narration = ' '.join(row.split(' ')[:-1])
+                 labels.append(narration.replace('_', ' ').lower())
+                 # labels.append(narration)
+         mapping_vn2act = {label: i for i, label in enumerate(labels)}
+         print(len(labels), labels[:5])
+     else:
+         raise NotImplementedError
+     return labels, mapping_vn2act
+
+
+ def generate_tokenizer(model):
+     if model.endswith('DISTILBERT_BASE'):
+         tokenizer = MyDistilBertTokenizer('distilbert-base-uncased')
+     elif model.endswith('BERT_BASE'):
+         tokenizer = MyBertTokenizer('bert-base-uncased')
+     elif model.endswith('BERT_LARGE'):
+         tokenizer = MyBertTokenizer('bert-large-uncased')
+     elif model.endswith('GPT2'):
+         tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True)
+     elif model.endswith('GPT2_MEDIUM'):
+         tokenizer = MyGPT2Tokenizer('gpt2-medium', add_bos=True)
+     elif model.endswith('GPT2_LARGE'):
+         tokenizer = MyGPT2Tokenizer('gpt2-large', add_bos=True)
+     elif model.endswith('GPT2_XL'):
+         tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True)
+     else:
+         print("Using SimpleTokenizer because of model '{}'. "
+               "Please check if this is what you want".format(model))
+         tokenizer = SimpleTokenizer()
+     return tokenizer
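`generate_label_map('charades_ego')` assumes each line of Charades_v1_classes.txt is a 4-character class id, a space, and the class description; a short sketch of the expected inputs and outputs (the example line and model name are illustrative):

```python
from svitt.preprocess import generate_label_map, generate_tokenizer

# data/charades_ego/Charades_v1_classes.txt is expected to contain lines like
#   c000 Holding some clothes
labels, mapping_vn2act = generate_label_map('charades_ego')
print(len(labels))             # 157 Charades action classes
print(mapping_vn2act['c000'])  # 0

# Tokenizer choice is keyed off the model-name suffix: this name ends in
# 'DISTILBERT_BASE', so MyDistilBertTokenizer('distilbert-base-uncased') is returned.
tokenizer = generate_tokenizer('CLIP_OPENAI_TIMESFORMER_BASE_DISTILBERT_BASE')
```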
svitt/sparse_config.py ADDED
@@ -0,0 +1,351 @@
1
+ # coding=utf-8
2
+ # Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from collections import OrderedDict
17
+ from typing import Mapping
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.onnx import OnnxConfig
21
+
22
+
23
+ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24
+ "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
25
+ "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
26
+ "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
27
+ "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
28
+ "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
29
+ "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
30
+ "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
31
+ "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
32
+ "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json",
33
+ "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json",
34
+ "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json",
35
+ "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json",
36
+ "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
37
+ "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
38
+ "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
39
+ "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
40
+ "cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json",
41
+ "cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json",
42
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json",
43
+ "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
44
+ "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
45
+ "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
46
+ # See all BERT models at https://huggingface.co/models?filter=bert
47
+ }
48
+
49
+
50
+ class BertConfig(PretrainedConfig):
51
+ r"""
52
+ This is the configuration class to store the configuration of a [`BertModel`] or a
53
+ [`TFBertModel`]. It is used to instantiate a BERT model according to the specified arguments,
54
+ defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
55
+ to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
56
+
57
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
58
+ outputs. Read the documentation from [`PretrainedConfig`] for more information.
59
+
60
+
61
+ Args:
62
+ vocab_size (`int`, *optional*, defaults to 30522):
63
+ Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
64
+ `inputs_ids` passed when calling [`BertModel`] or
65
+ [`TFBertModel`].
66
+ hidden_size (`int`, *optional*, defaults to 768):
67
+ Dimensionality of the encoder layers and the pooler layer.
68
+ num_hidden_layers (`int`, *optional*, defaults to 12):
69
+ Number of hidden layers in the Transformer encoder.
70
+ num_attention_heads (`int`, *optional*, defaults to 12):
71
+ Number of attention heads for each attention layer in the Transformer encoder.
72
+ intermediate_size (`int`, *optional*, defaults to 3072):
73
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
74
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
75
+ The non-linear activation function (function or string) in the encoder and pooler. If string,
76
+ `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
77
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
78
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
79
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
80
+ The dropout ratio for the attention probabilities.
81
+ max_position_embeddings (`int`, *optional*, defaults to 512):
82
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
83
+ just in case (e.g., 512 or 1024 or 2048).
84
+ type_vocab_size (`int`, *optional*, defaults to 2):
85
+ The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or
86
+ [`TFBertModel`].
87
+ initializer_range (`float`, *optional*, defaults to 0.02):
88
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
89
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
90
+ The epsilon used by the layer normalization layers.
91
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
92
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
93
+ `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
94
+ `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
95
+ *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
96
+ use_cache (`bool`, *optional*, defaults to `True`):
97
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
98
+ relevant if `config.is_decoder=True`.
99
+ classifier_dropout (`float`, *optional*):
100
+ The dropout ratio for the classification head.
101
+
102
+ Examples:
103
+
104
+ ```python
105
+ >>> from transformers import BertModel, BertConfig
106
+
107
+ >>> # Initializing a BERT bert-base-uncased style configuration
108
+ >>> configuration = BertConfig()
109
+
110
+ >>> # Initializing a model from the bert-base-uncased style configuration
111
+ >>> model = BertModel(configuration)
112
+
113
+ >>> # Accessing the model configuration
114
+ >>> configuration = model.config
115
+ ```"""
116
+ model_type = "bert"
117
+
118
+ def __init__(
119
+ self,
120
+ vocab_size=30522,
121
+ hidden_size=768,
122
+ num_hidden_layers=12,
123
+ num_attention_heads=12,
124
+ intermediate_size=3072,
125
+ hidden_act="gelu",
126
+ hidden_dropout_prob=0.1,
127
+ attention_probs_dropout_prob=0.1,
128
+ max_position_embeddings=512,
129
+ type_vocab_size=2,
130
+ initializer_range=0.02,
131
+ layer_norm_eps=1e-12,
132
+ pad_token_id=0,
133
+ position_embedding_type="absolute",
134
+ use_cache=True,
135
+ classifier_dropout=None,
136
+ token_keep_rate=1,
137
+ token_keep_strategy='cls_attn',
138
+ token_drop_loc=[9],
139
+ **kwargs
140
+ ):
141
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
142
+
143
+ self.vocab_size = vocab_size
144
+ self.hidden_size = hidden_size
145
+ self.num_hidden_layers = num_hidden_layers
146
+ self.num_attention_heads = num_attention_heads
147
+ self.hidden_act = hidden_act
148
+ self.intermediate_size = intermediate_size
149
+ self.hidden_dropout_prob = hidden_dropout_prob
150
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
151
+ self.max_position_embeddings = max_position_embeddings
152
+ self.type_vocab_size = type_vocab_size
153
+ self.initializer_range = initializer_range
154
+ self.layer_norm_eps = layer_norm_eps
155
+ self.position_embedding_type = position_embedding_type
156
+ self.use_cache = use_cache
157
+ self.classifier_dropout = classifier_dropout
158
+ self.token_keep_rate = token_keep_rate
159
+ self.token_keep_strategy = token_keep_strategy
160
+ self.token_drop_loc = token_drop_loc
161
+
162
+
163
+ class BertOnnxConfig(OnnxConfig):
164
+ @property
165
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
166
+ return OrderedDict(
167
+ [
168
+ ("input_ids", {0: "batch", 1: "sequence"}),
169
+ ("attention_mask", {0: "batch", 1: "sequence"}),
170
+ ("token_type_ids", {0: "batch", 1: "sequence"}),
171
+ ]
172
+ )
173
+
174
+
175
+ BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
176
+ "microsoft/beit-base-patch16-224-in22k": "https://huggingface.co/microsoft/beit-base-patch16-224-in22k/resolve/main/config.json",
177
+ # See all BEiT models at https://huggingface.co/models?filter=beit
178
+ }
179
+
180
+
181
+ class BeitConfig(PretrainedConfig):
182
+ r"""
183
+ This is the configuration class to store the configuration of a [`BeitModel`]. It is used to
184
+ instantiate an BEiT model according to the specified arguments, defining the model architecture. Instantiating a
185
+ configuration with the defaults will yield a similar configuration to that of the BEiT
186
+ [microsoft/beit-base-patch16-224-in22k](https://huggingface.co/microsoft/beit-base-patch16-224-in22k)
187
+ architecture.
188
+
189
+ Args:
190
+ vocab_size (`int`, *optional*, defaults to 8092):
191
+ Vocabulary size of the BEiT model. Defines the number of different image tokens that can be used during
192
+ pre-training.
193
+ hidden_size (`int`, *optional*, defaults to 768):
194
+ Dimensionality of the encoder layers and the pooler layer.
195
+ num_hidden_layers (`int`, *optional*, defaults to 12):
196
+ Number of hidden layers in the Transformer encoder.
197
+ num_attention_heads (`int`, *optional*, defaults to 12):
198
+ Number of attention heads for each attention layer in the Transformer encoder.
199
+ intermediate_size (`int`, *optional*, defaults to 3072):
200
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
201
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
202
+ The non-linear activation function (function or string) in the encoder and pooler. If string,
203
+ `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
204
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
205
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
206
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
207
+ The dropout ratio for the attention probabilities.
208
+ initializer_range (`float`, *optional*, defaults to 0.02):
209
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
210
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
211
+ The epsilon used by the layer normalization layers.
212
+ image_size (`int`, *optional*, defaults to `224`):
213
+ The size (resolution) of each image.
214
+ patch_size (`int`, *optional*, defaults to `16`):
215
+ The size (resolution) of each patch.
216
+ num_channels (`int`, *optional*, defaults to `3`):
217
+ The number of input channels.
218
+ use_mask_token (`bool`, *optional*, defaults to `False`):
219
+ Whether to use a mask token for masked image modeling.
220
+ use_absolute_position_embeddings (`bool`, *optional*, defaults to `False`):
221
+ Whether to use BERT-style absolute position embeddings.
222
+ use_relative_position_bias (`bool`, *optional*, defaults to `False`):
223
+ Whether to use T5-style relative position embeddings in the self-attention layers.
224
+ use_shared_relative_position_bias (`bool`, *optional*, defaults to `False`):
225
+ Whether to use the same relative position embeddings across all self-attention layers of the Transformer.
226
+ layer_scale_init_value (`float`, *optional*, defaults to 0.1):
227
+ Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
228
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
229
+ Stochastic depth rate per sample (when applied in the main path of residual layers).
230
+ use_mean_pooling (`bool`, *optional*, defaults to `True`):
231
+ Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
232
+ CLS token, before applying the classification head.
233
+ out_indices (`List[int]`, *optional*, defaults to `[3, 5, 7, 11]`):
234
+ Indices of the feature maps to use for semantic segmentation.
235
+ pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
236
+ Pooling scales used in Pooling Pyramid Module applied on the last feature map.
237
+ use_auxiliary_head (`bool`, *optional*, defaults to `True`):
238
+ Whether to use an auxiliary head during training.
239
+ auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
240
+ Weight of the cross-entropy loss of the auxiliary head.
241
+ auxiliary_channels (`int`, *optional*, defaults to 256):
242
+ Number of channels to use in the auxiliary head.
243
+ auxiliary_num_convs (`int`, *optional*, defaults to 1):
244
+ Number of convolutional layers to use in the auxiliary head.
245
+ auxiliary_concat_input (`bool`, *optional*, defaults to `False`):
246
+ Whether to concatenate the output of the auxiliary head with the input before the classification layer.
247
+ semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
248
+ The index that is ignored by the loss function of the semantic segmentation model.
249
+
250
+ Example:
251
+
252
+ ```python
253
+ >>> from transformers import BeitModel, BeitConfig
254
+
255
+ >>> # Initializing a BEiT beit-base-patch16-224-in22k style configuration
256
+ >>> configuration = BeitConfig()
257
+
258
+ >>> # Initializing a model from the beit-base-patch16-224-in22k style configuration
259
+ >>> model = BeitModel(configuration)
260
+
261
+ >>> # Accessing the model configuration
262
+ >>> configuration = model.config
263
+ ```"""
264
+ model_type = "beit"
265
+
266
+ def __init__(
267
+ self,
268
+ vocab_size=8192,
269
+ hidden_size=768,
270
+ num_hidden_layers=12,
271
+ num_attention_heads=12,
272
+ intermediate_size=3072,
273
+ hidden_act="gelu",
274
+ hidden_dropout_prob=0.0,
275
+ attention_probs_dropout_prob=0.0,
276
+ initializer_range=0.02,
277
+ layer_norm_eps=1e-12,
278
+ is_encoder_decoder=False,
279
+ image_size=224,
280
+ patch_size=16,
281
+ num_channels=3,
282
+ use_mask_token=False,
283
+ use_absolute_position_embeddings=False,
284
+ use_relative_position_bias=False,
285
+ use_shared_relative_position_bias=False,
286
+ layer_scale_init_value=0.1,
287
+ drop_path_rate=0.1,
288
+ use_mean_pooling=True,
289
+ out_indices=[3, 5, 7, 11],
290
+ pool_scales=[1, 2, 3, 6],
291
+ use_auxiliary_head=True,
292
+ auxiliary_loss_weight=0.4,
293
+ auxiliary_channels=256,
294
+ auxiliary_num_convs=1,
295
+ auxiliary_concat_input=False,
296
+ semantic_loss_ignore_index=255,
297
+ token_keep_rate=1,
298
+ token_keep_strategy='cls_attn',
299
+ token_drop_loc=[3, 6, 9],
300
+ sparse_random_attn=None,
301
+ sparse_local_attn=1,
302
+ attn_block_size=1,
303
+ num_cls_tokens=1,
304
+ token_3d_order='none',
305
+ **kwargs
306
+ ):
307
+ super().__init__(**kwargs)
308
+
309
+ self.vocab_size = vocab_size
310
+ self.hidden_size = hidden_size
311
+ self.num_hidden_layers = num_hidden_layers
312
+ self.num_attention_heads = num_attention_heads
313
+ self.intermediate_size = intermediate_size
314
+ self.hidden_act = hidden_act
315
+ self.hidden_dropout_prob = hidden_dropout_prob
316
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
317
+ self.initializer_range = initializer_range
318
+ self.layer_norm_eps = layer_norm_eps
319
+
320
+ self.image_size = image_size
321
+ self.patch_size = patch_size
322
+ self.num_channels = num_channels
323
+ self.use_mask_token = use_mask_token
324
+ self.use_absolute_position_embeddings = use_absolute_position_embeddings
325
+ self.use_relative_position_bias = use_relative_position_bias
326
+ self.use_shared_relative_position_bias = use_shared_relative_position_bias
327
+ self.layer_scale_init_value = layer_scale_init_value
328
+ self.drop_path_rate = drop_path_rate
329
+ self.use_mean_pooling = use_mean_pooling
330
+ # decode head attributes (semantic segmentation)
331
+ self.out_indices = out_indices
332
+ self.pool_scales = pool_scales
333
+ # auxiliary head attributes (semantic segmentation)
334
+ self.use_auxiliary_head = use_auxiliary_head
335
+ self.auxiliary_loss_weight = auxiliary_loss_weight
336
+ self.auxiliary_channels = auxiliary_channels
337
+ self.auxiliary_num_convs = auxiliary_num_convs
338
+ self.auxiliary_concat_input = auxiliary_concat_input
339
+ self.semantic_loss_ignore_index = semantic_loss_ignore_index
340
+
341
+ # node sparsification
342
+ self.token_keep_rate = token_keep_rate
343
+ self.token_keep_strategy = token_keep_strategy
344
+ self.token_drop_loc = token_drop_loc
345
+ # edge sparsification
346
+ self.sparse_random_attn = sparse_random_attn
347
+ self.sparse_local_attn = sparse_local_attn
348
+ self.attn_block_size = attn_block_size
349
+ self.num_cls_tokens = num_cls_tokens
350
+ # token order
351
+ self.token_3d_order = token_3d_order
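The sparse knobs appended to both configs above are what distinguish them from the stock Hugging Face versions. A hedged example of instantiating them with overrides (the numbers are illustrative, not the settings shipped in configs/):

```python
from svitt.sparse_config import BeitConfig, BertConfig

# Vision side: node sparsification (keep 70% of tokens after blocks 3/6/9) plus
# edge sparsification (a 1-block local window and 2 random blocks per query block).
vis_cfg = BeitConfig.from_pretrained(
    "microsoft/beit-base-patch16-224-in22k",
    image_size=224,
    token_keep_rate=0.7,
    token_keep_strategy="cls_attn",
    token_drop_loc=[3, 6, 9],
    sparse_local_attn=1,
    sparse_random_attn=2,
    attn_block_size=56,
    num_cls_tokens=1,
    token_3d_order="none",
)

# Text/fusion side: a single token-drop stage after layer 9.
txt_cfg = BertConfig.from_pretrained(
    "bert-base-uncased",
    token_keep_rate=0.7,
    token_keep_strategy="cls_attn",
    token_drop_loc=[9],
)
```

In svitt/model.py above, the same overrides are injected through `vision_encoder_args` / `text_encoder_args` in the experiment config rather than hard-coded.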
svitt/sparse_xbeit.py ADDED
@@ -0,0 +1,1585 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch BEiT model. """
16
+
17
+
18
+ import collections.abc
19
+ import math
20
+ import numpy as np
21
+ from dataclasses import dataclass
22
+ from typing import Optional, Tuple
23
+ import zCurve
24
+ import hilbert
25
+
26
+ import torch
27
+ import torch.utils.checkpoint
28
+ from torch import nn
29
+ from torch.nn import CrossEntropyLoss, MSELoss
30
+ from einops import rearrange, repeat
31
+
32
+ from transformers.activations import ACT2FN
33
+ from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
34
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
35
+ from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
36
+ from svitt.sparse_config import BeitConfig
37
+
38
+
39
+ _CONFIG_FOR_DOC = "BeitConfig"
40
+ _CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224"
41
+
42
+ BEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
43
+ "microsoft/beit-base-patch16-224",
44
+ # See all BEiT models at https://huggingface.co/models?filter=beit
45
+ ]
46
+
47
+
48
+ @dataclass
49
+ class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
50
+ """
51
+ Class for outputs of :class:`~transformers.BeitModel`.
52
+
53
+ Args:
54
+ last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
55
+ Sequence of hidden-states at the output of the last layer of the model.
56
+ pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
57
+ Average of the last layer hidden states of the patch tokens (excluding the `[CLS]` token) if
58
+ `config.use_mean_pooling` is set to True. If set to False, then the final hidden state of the `[CLS]` token
59
+ will be returned.
60
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
61
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
62
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
63
+
64
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
65
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
66
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
67
+ sequence_length, sequence_length)`.
68
+
69
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
70
+ heads.
71
+ """
72
+ token_idx: Optional[Tuple[torch.LongTensor]] = None
73
+
74
+
75
+ @dataclass
76
+ class BeitModelOutput(BaseModelOutput):
77
+ token_idx: Optional[Tuple[torch.LongTensor]] = None
78
+
79
+
80
+ # Inspired by
81
+ # https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py
82
+ # From PyTorch internals
83
+ def to_2tuple(x):
84
+ if isinstance(x, collections.abc.Iterable):
85
+ return x
86
+ return (x, x)
87
+
88
+
89
+ # Based on https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py
90
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
91
+ """
92
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
93
+
94
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
95
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
96
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
97
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
98
+ argument.
99
+ """
100
+ if drop_prob == 0.0 or not training:
101
+ return x
102
+ keep_prob = 1 - drop_prob
103
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
104
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
105
+ random_tensor.floor_() # binarize
106
+ output = x.div(keep_prob) * random_tensor
107
+ return output
108
+
109
+
110
+ class DropPath(nn.Module):
111
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
112
+
113
+ def __init__(self, drop_prob=None):
114
+ super().__init__()
115
+ self.drop_prob = drop_prob
116
+
117
+ def forward(self, x):
118
+ return drop_path(x, self.drop_prob, self.training)
119
+
120
+ def extra_repr(self) -> str:
121
+ return "p={}".format(self.drop_prob)
122
+
123
+
124
+ # Based on timm implementation, which can be found here:
125
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
126
+ class BeitEmbeddings(nn.Module):
127
+ """
128
+ Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
129
+
130
+ """
131
+
132
+ def __init__(self, config):
133
+ super().__init__()
134
+
135
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
136
+ if config.use_mask_token:
137
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
138
+ else:
139
+ self.mask_token = None
140
+ self.patch_embeddings = PatchEmbeddings(
141
+ image_size=config.image_size,
142
+ patch_size=config.patch_size,
143
+ num_channels=config.num_channels,
144
+ embed_dim=config.hidden_size,
145
+ )
146
+ num_patches = self.patch_embeddings.num_patches
147
+ if config.use_absolute_position_embeddings:
148
+ self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
149
+ else:
150
+ self.position_embeddings = None
151
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
152
+
153
+ def forward(self, pixel_values, bool_masked_pos=None):
154
+
155
+ if pixel_values.ndim == 5: # video input
156
+ embeddings = self.patch_embeddings(pixel_values.flatten(0, 1))
157
+ embeddings = rearrange(embeddings, '(b m) n d -> b (m n) d', m=pixel_values.shape[1])
158
+ else: # image input
159
+ embeddings = self.patch_embeddings(pixel_values)
160
+
161
+ batch_size, seq_len, _ = embeddings.size()
162
+
163
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
164
+ if bool_masked_pos is not None:
165
+ mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
166
+ # replace the masked visual tokens by mask_tokens
167
+ w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
168
+ embeddings = embeddings * (1 - w) + mask_tokens * w
169
+
170
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
171
+ if self.position_embeddings is not None:
172
+ embeddings = embeddings + self.position_embeddings
173
+ embeddings = self.dropout(embeddings)
174
+
175
+ return embeddings
176
+
177
+
178
+ # Based on timm implementation, which can be found here:
179
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
180
+ class PatchEmbeddings(nn.Module):
181
+ """
182
+ Image to Patch Embedding.
183
+ """
184
+
185
+ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768):
186
+ super().__init__()
187
+ image_size = to_2tuple(image_size)
188
+ patch_size = to_2tuple(patch_size)
189
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
190
+ patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
191
+ self.image_size = image_size
192
+ self.patch_size = patch_size
193
+ self.num_patches = num_patches
194
+ self.patch_shape = patch_shape
195
+
196
+ self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
197
+
198
+ def forward(self, pixel_values):
199
+ batch_size, num_channels, height, width = pixel_values.shape
200
+ # FIXME look at relaxing size constraints
201
+ if height != self.image_size[0] or width != self.image_size[1]:
202
+ raise ValueError(
203
+ f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
204
+ )
205
+ x = self.projection(pixel_values).flatten(2).transpose(1, 2)
206
+
207
+ return x
208
+
209
+
210
+ class BeitSelfAttention(nn.Module):
211
+ def __init__(self, config, window_size=None):
212
+ super().__init__()
213
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
214
+ raise ValueError(
215
+ f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
216
+ f"heads {config.num_attention_heads}."
217
+ )
218
+
219
+ self.num_attention_heads = config.num_attention_heads
220
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
221
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
222
+
223
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
224
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
225
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
226
+
227
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
228
+
229
+ # sparse params
230
+ self.random_attn = config.sparse_random_attn
231
+ self.local_attn = config.sparse_local_attn
232
+ self.block_size = config.attn_block_size
233
+ self.num_cls_tokens = config.num_cls_tokens
234
+ if self.local_attn is not None and self.random_attn is not None:
235
+ self.num_kv_blocks = self.local_attn + self.random_attn
236
+
237
+ if window_size:
238
+ self.relative_position_bias = BeitRelativePositionBias3D(config, window_size=window_size)
239
+ else:
240
+ self.relative_position_bias = None
241
+
242
+ def split_heads(self, x):
243
+ return rearrange(x, 'b n (h d) -> b h n d', h=self.num_attention_heads)
244
+
245
+ def join_heads(self, x):
246
+ return rearrange(x, 'b h n d -> b n (h d)')
247
+
248
+ def blockify(self, x):
249
+ assert x.dim() == 4, f"Unsupported input shape {x.shape}"
250
+ seq_len = x.shape[2]
251
+ if seq_len % self.block_size > 0: # seq_len not divisible by block_size, zero pad
252
+ pad_len = self.block_size - seq_len % self.block_size
253
+ x = nn.functional.pad(x, (0, 0, 0, pad_len))
254
+ else:
255
+ pad_len = 0
256
+ x = rearrange(x, 'b h (m n) d -> b h m n d', n=self.block_size)
257
+ return x, pad_len
258
+
259
+ def dense_attention(self, q, k, v, head_mask=None, relative_position_bias=None, q_idx=None, k_idx=None):
260
+ # q, k, v: (bsz, num_heads, seq_len, dims)
261
+ assert k.shape[2] == v.shape[2], "Key and value shapes mismatch"
262
+ sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
263
+ sim = sim / math.sqrt(self.attention_head_size)
264
+
265
+ # Add relative position bias if present.
266
+ if self.relative_position_bias is not None:
267
+ if q_idx is not None and q_idx.ndim == 2:
268
+ assert k_idx is not None and len(q_idx) == len(k_idx)
269
+ bias = torch.stack([
270
+ self.relative_position_bias(from_idx=q_idx_, to_idx=k_idx_)
271
+ for q_idx_, k_idx_ in zip(q_idx, k_idx)
272
+ ])
273
+ else:
274
+ bias = self.relative_position_bias(from_idx=q_idx, to_idx=k_idx).unsqueeze(0)
275
+ sim = sim + bias
276
+
277
+ # Add shared relative position bias if provided.
278
+ if relative_position_bias is not None:
279
+ sim = sim + relative_position_bias
280
+
281
+ # Normalize the attention scores to probabilities.
282
+ attn = sim.softmax(dim=-1)
283
+ attn = self.dropout(attn)
284
+ if head_mask is not None:
285
+ attn = attn * head_mask
286
+
287
+ out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
288
+ return out, attn
289
+
290
+ def _sparse_attn_relative_position_bias(self, q_idx, pad_q, attn_idx, group_len):
291
+ q_idx_blk = nn.functional.pad(q_idx, (0, pad_q)).view(-1, self.block_size)
292
+ attn_idx_flt = rearrange(q_idx_blk[attn_idx], 'm n j -> m (n j)') # (seq_len, num_kv_blocks * group_len)
293
+ cls_idx = torch.arange(self.num_cls_tokens, device=q_idx.device)
294
+ cls_idx = repeat(cls_idx, 'n -> m n', m=len(attn_idx_flt))
295
+ attn_idx_flt = torch.cat((cls_idx, attn_idx_flt), dim=1)
296
+ attn_idx_flt = repeat(attn_idx_flt, 'm n -> (m i) n', i=group_len)
297
+ if pad_q > 0:
298
+ attn_idx_flt = attn_idx_flt[:-pad_q]
299
+ bias_flt = self.relative_position_bias(from_idx=q_idx, to_idx=attn_idx_flt)
300
+ if pad_q > 0:
301
+ bias_flt = nn.functional.pad(bias_flt, (0, 0, 0, pad_q))
302
+ return rearrange(bias_flt, 'h (m i) n -> h m i n', i=group_len) # num_heads, seq_len, group_len, (num_kv_blocks * group_len + num_cls_tokens)
303
+
304
+ def sparse_attention(self, q, k, v, head_mask=None, relative_position_bias=None, q_idx=None, mimic_full=False):
305
+ assert self.local_attn == 0 or self.local_attn % 2 == 1, "Even local window size not supported"
306
+ assert k.shape[2] == v.shape[2], "Key and value shapes mismatch"
307
+
308
+
309
+ if not mimic_full:
310
+ cls_k, k = k[..., :self.num_cls_tokens, :], k[..., self.num_cls_tokens:, :] # cls_k: (bsz, num_heads, num_cls_tokens, dims)
311
+ cls_v, v = v[..., :self.num_cls_tokens, :], v[..., self.num_cls_tokens:, :]
312
+
313
+ # pad token sequence to multiples of block_size
314
+ if mimic_full:
315
+ bsz, num_heads, seq_len, dims = q.shape
316
+ else:
317
+ q, pad_q = self.blockify(q) # q: (bsz, num_heads, seq_len, group_len, dims)
318
+ k, pad_k = self.blockify(k)
319
+ v, pad_v = self.blockify(v)
320
+ bsz, num_heads, seq_len, group_len, dims = q.shape
321
+
322
+ # global attention
323
+ cls_sim = torch.einsum('b h n i d, b h j d -> b h n i j', q, cls_k) # (bsz, num_heads, seq_len, group_len, num_cls_tokens)
324
+
325
+ if mimic_full:
326
+ sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
327
+ sim = sim / math.sqrt(self.attention_head_size)
328
+ sim = sim + self.relative_position_bias(from_idx=q_idx).unsqueeze(0)
329
+
330
+ else:
331
+ # initialize empty sim matrix
332
+ sim = torch.empty((bsz, num_heads, seq_len, self.num_kv_blocks, group_len, group_len), device=q.device)
333
+ attn_idx = torch.zeros((seq_len, self.num_kv_blocks), dtype=torch.int64, device=q.device)
334
+
335
+ # local window attention
336
+ cnt = 0
337
+ if self.local_attn > 0:
338
+ num_rolls = self.local_attn // 2
339
+ for r in range(-num_rolls, num_rolls + 1):
340
+ sim[..., cnt, :, :] = torch.einsum('b h n i d, b h n j d -> b h n i j', q, k.roll(-r, dims=2))
341
+ attn_idx[:, cnt] = torch.arange(seq_len, device=q.device).roll(r)
342
+ cnt += 1
343
+
344
+ # random attention
345
+ if self.random_attn > 0:
346
+ # generate random attention pattern
347
+ rand = torch.rand((seq_len, seq_len), device=q.device)
348
+ if self.local_attn > 0:
349
+ # avoid overlap with local attention
350
+ for r in range(-num_rolls, num_rolls + 1):
351
+ tgt_idx = list(i % seq_len for i in range(r, seq_len + r))
352
+ rand[range(seq_len), tgt_idx] = 0
353
+ _, idx = rand.topk(self.random_attn, dim=-1) # seq_len, random_attn
354
+ idx, _ = torch.sort(idx, dim=1)
355
+ attn_idx[:, cnt:] = idx
356
+
357
+ idx_ = repeat(idx, 'n m -> b h n m i d', b=bsz, h=num_heads, i=group_len, d=dims)
358
+
359
+ for r in range(self.random_attn):
360
+ sim[..., cnt, :, :] = torch.einsum('b h n i d, b h n j d -> b h n i j', q, k.gather(2, idx_[..., r, :, :]))
361
+ cnt += 1
362
+
363
+ sim = rearrange(sim, 'b h m n i j -> b h m i (n j)') # (bsz, num_heads, seq_len, group_len, num_kv_blocks * group_len)
364
+ sim = torch.cat((cls_sim, sim), -1)
365
+ sim = sim / math.sqrt(self.attention_head_size)
366
+
367
+ # Add relative position bias if present.
368
+ # NOTE: we assume q and k (excluding cls) use the same token indexing for the relative position embedding
369
+ if self.relative_position_bias is not None:
370
+ assert q_idx is not None, "query index required for relative position bias"
371
+ if q_idx.ndim == 2:
372
+ # different indices for each sample
373
+ bias = torch.stack([
374
+ self._sparse_attn_relative_position_bias(q_idx_, pad_q, attn_idx, group_len)
375
+ for q_idx_ in q_idx
376
+ ])
377
+ else:
378
+ bias = self._sparse_attn_relative_position_bias(q_idx, pad_q, attn_idx, group_len).unsqueeze(0)
379
+ sim = sim + bias
380
+
381
+ # Add shared relative position bias if provided.
382
+ if relative_position_bias is not None:
383
+ raise NotImplementedError
384
+ sim = sim + relative_position_bias
385
+
386
+ attn = sim.softmax(dim=-1)
387
+ attn = self.dropout(attn)
388
+ if head_mask is not None:
389
+ attn = attn * head_mask
390
+
391
+ # block attention
392
+ if mimic_full:
393
+ out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
394
+
395
+ else:
396
+ out = torch.empty((bsz, num_heads, seq_len, group_len, dims), device=q.device)
397
+ for m in range(seq_len):
398
+ v_row = torch.index_select(v, 2, attn_idx[m])
399
+ v_row = rearrange(v_row, 'b h n j d -> b h (n j) d') # (bsz, num_heads, num_kv_blocks * group_len, dims)
400
+ v_row = torch.cat((cls_v, v_row), 2)
401
+ out[..., m, :, :] = torch.einsum('b h i j, b h j d -> b h i d', attn[..., m, :, :], v_row)
402
+ out = rearrange(out, 'b h n i d -> b h (n i) d')
403
+ if pad_q > 0:
404
+ out = out[..., :-pad_q, :]
405
+
406
+ return out, attn
407
+
408
+ def forward(self, hidden_states, head_mask=None, output_attentions=False, relative_position_bias=None, token_idx=None):
409
+ # compute qkv
410
+ q = self.split_heads(self.query(hidden_states))
411
+ k = self.split_heads(self.key(hidden_states))
412
+ v = self.split_heads(self.value(hidden_states))
413
+
414
+ # combine local token_idx with cls tokens
415
+ # NOTE: assume token_idx starts from 0
416
+ cls_q_idx = torch.arange(self.num_cls_tokens, device=q.device)
417
+ if token_idx is not None:
418
+ if token_idx.ndim == 2:
419
+ cls_q_idx = repeat(cls_q_idx, 'n -> b n', b=q.shape[0])
420
+ all_token_idx = torch.cat((cls_q_idx, token_idx + self.num_cls_tokens), dim=-1)
421
+ else:
422
+ all_token_idx = None
423
+
424
+ if self.random_attn is None:
425
+ outputs, attention_probs = self.dense_attention(q, k, v, head_mask=head_mask,
426
+ relative_position_bias=relative_position_bias,
427
+ q_idx=all_token_idx,
428
+ k_idx=all_token_idx)
429
+ cls_attention_probs = attention_probs[..., :self.num_cls_tokens, :]
430
+
431
+ else:
432
+ cls_q, q = q[..., :self.num_cls_tokens, :], q[..., self.num_cls_tokens:, :]
433
+
434
+ # dense global attention (num_cls_tokens, seq_len)
435
+ cls_outputs, cls_attention_probs = self.dense_attention(cls_q, k, v, head_mask=head_mask,
436
+ relative_position_bias=relative_position_bias,
437
+ q_idx=cls_q_idx,
438
+ k_idx=all_token_idx)
439
+
440
+ # sparse local attention (local_seq_len, seq_len)
441
+ if token_idx is None:
442
+ token_idx = torch.arange(q.shape[-2], device=q.device)
443
+ outputs, attention_probs = self.sparse_attention(q, k, v, head_mask=head_mask,
444
+ relative_position_bias=relative_position_bias,
445
+ q_idx=token_idx + self.num_cls_tokens)
446
+
447
+ outputs = torch.cat((cls_outputs, outputs), dim=2)
448
+
449
+ outputs = self.join_heads(outputs)
450
+
451
+ outputs = (outputs, cls_attention_probs) if output_attentions else (outputs,)
452
+
453
+ return outputs
454
+
455
+
456
+ class BeitSelfOutput(nn.Module):
457
+ """
458
+ The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
459
+ layernorm applied before each block.
460
+ """
461
+
462
+ def __init__(self, config):
463
+ super().__init__()
464
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
465
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
466
+
467
+ def forward(self, hidden_states, input_tensor, gamma=None):
468
+ hidden_states = self.dense(hidden_states)
469
+ hidden_states = self.dropout(hidden_states)
470
+
471
+ return hidden_states
472
+
473
+
474
+ class BeitAttention(nn.Module):
475
+ def __init__(self, config, window_size=None):
476
+ super().__init__()
477
+ self.attention = BeitSelfAttention(config, window_size=window_size)
478
+ self.output = BeitSelfOutput(config)
479
+ self.pruned_heads = set()
480
+
481
+ def prune_heads(self, heads):
482
+ if len(heads) == 0:
483
+ return
484
+ heads, index = find_pruneable_heads_and_indices(
485
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
486
+ )
487
+
488
+ # Prune linear layers
489
+ self.attention.query = prune_linear_layer(self.attention.query, index)
490
+ self.attention.key = prune_linear_layer(self.attention.key, index)
491
+ self.attention.value = prune_linear_layer(self.attention.value, index)
492
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
493
+
494
+ # Update hyper params and store pruned heads
495
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
496
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
497
+ self.pruned_heads = self.pruned_heads.union(heads)
498
+
499
+ def forward(self, hidden_states, head_mask=None, output_attentions=False, relative_position_bias=None, token_idx=None):
500
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions, relative_position_bias, token_idx)
501
+
502
+ attention_output = self.output(self_outputs[0], hidden_states)
503
+
504
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
505
+ return outputs
506
+
507
+
508
+ class BeitIntermediate(nn.Module):
509
+ def __init__(self, config):
510
+ super().__init__()
511
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
512
+ if isinstance(config.hidden_act, str):
513
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
514
+ else:
515
+ self.intermediate_act_fn = config.hidden_act
516
+
517
+ def forward(self, hidden_states):
518
+ hidden_states = self.dense(hidden_states)
519
+ hidden_states = self.intermediate_act_fn(hidden_states)
520
+
521
+ return hidden_states
522
+
523
+
524
+ class BeitOutput(nn.Module):
525
+ def __init__(self, config):
526
+ super().__init__()
527
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
528
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
529
+
530
+ def forward(self, hidden_states):
531
+ hidden_states = self.dense(hidden_states)
532
+ hidden_states = self.dropout(hidden_states)
533
+
534
+ return hidden_states
535
+
536
+
537
+ class BeitLayer(nn.Module):
538
+ """This corresponds to the Block class in the timm implementation."""
539
+
540
+ def __init__(self, config, window_size=None, drop_path_rate=0.0,
541
+ token_keep_rate=1.0):
542
+ super().__init__()
543
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
544
+ self.seq_len_dim = 1
545
+ self.attention = BeitAttention(config, window_size=window_size)
546
+ self.intermediate = BeitIntermediate(config)
547
+ self.output = BeitOutput(config)
548
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
549
+ self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
550
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
551
+
552
+ # sparse params
553
+ self.token_keep_rate = token_keep_rate
554
+ self.token_keep_strategy = config.token_keep_strategy
555
+ self.num_cls_tokens = config.num_cls_tokens
556
+
557
+ init_values = config.layer_scale_init_value
558
+ if init_values > 0:
559
+ self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
560
+ self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
561
+ else:
562
+ self.lambda_1, self.lambda_2 = None, None
563
+
564
+ def sparsify(self, x, attn):
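+ # Keep only the top `token_keep_rate` fraction of patch tokens (all cls tokens are kept),
+ # ranked by cls-token attention ('cls_attn') or chosen at random ('random').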
565
+ x_cls, x_ = x[:, :self.num_cls_tokens], x[:, self.num_cls_tokens:]
566
+ assert 0 < self.token_keep_rate <= 1, "Expected keep rate in range (0, 1]"
567
+ left_tokens = math.ceil(self.token_keep_rate * x_.size(1))
568
+
569
+ if self.token_keep_strategy == 'cls_attn':
570
+ if len(attn.shape) == 4:
571
+ attn = attn.mean(1) # pool over attention heads
572
+ cls_attn = attn[:, 0, self.num_cls_tokens:]
573
+ _, idx = torch.topk(cls_attn, left_tokens, dim=1) # [B, left_tokens]
574
+
575
+ elif self.token_keep_strategy == 'random':
576
+ rand = torch.rand(x_.shape[:2], device=x_.device)
577
+ _, idx = torch.topk(rand, left_tokens, dim=1) # [B, left_tokens]
578
+
579
+ else:
580
+ raise NotImplementedError(f"Sparse strategy {self.token_keep_strategy} is not implemented")
581
+
582
+ idx, _ = torch.sort(idx, dim=1)
583
+ index = idx.unsqueeze(-1).expand(-1, -1, x_.size(-1)) # [B, left_tokens, C]
584
+ outputs = torch.cat((x_cls, x_.gather(1, index)), dim=1).contiguous()
585
+ return outputs, idx
586
+
587
+ def forward(self, hidden_states, head_mask=None, output_attentions=False, relative_position_bias=None, token_idx=None):
588
+ self_attention_outputs = self.attention(
589
+ self.layernorm_before(hidden_states), # in BEiT, layernorm is applied before self-attention
590
+ head_mask,
591
+ output_attentions=(output_attentions or self.token_keep_rate < 1),
592
+ relative_position_bias=relative_position_bias,
593
+ token_idx=token_idx
594
+ )
595
+ attention_output = self_attention_outputs[0]
596
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
597
+
598
+ # apply lambda_1 if present
599
+ if self.lambda_1 is not None:
600
+ attention_output = self.lambda_1 * attention_output
601
+
602
+ # first residual connection
603
+ hidden_states = self.drop_path(attention_output) + hidden_states
604
+
605
+ # in BEiT, layernorm is also applied after self-attention
606
+ layer_output = self.layernorm_after(hidden_states)
607
+
608
+ layer_output = self.intermediate(layer_output)
609
+ layer_output = self.output(layer_output)
610
+
611
+ if self.lambda_2 is not None:
612
+ layer_output = self.lambda_2 * layer_output
613
+
614
+ # second residual connection
615
+ layer_output = self.drop_path(layer_output) + hidden_states
616
+
617
+ # node sparsification
618
+ if self.token_keep_rate < 1:
619
+ layer_output, token_keep_idx = self.sparsify(layer_output, outputs[0])
620
+ if token_idx is not None:
621
+ if token_idx.ndim == 1:
622
+ token_idx = repeat(token_idx, 'n -> b n', b=len(token_keep_idx))
623
+ token_keep_idx = token_idx.gather(1, token_keep_idx)
624
+ outputs = outputs + (token_keep_idx,)
625
+
626
+ outputs = (layer_output,) + outputs
627
+
628
+ return outputs
629
+
630
+
631
+ class BeitRelativePositionBias(nn.Module):
632
+ def __init__(self, config, window_size):
633
+ super().__init__()
634
+ self.window_size = window_size
635
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
636
+ self.relative_position_bias_table = nn.Parameter(
637
+ torch.zeros(self.num_relative_distance, config.num_attention_heads)
638
+ ) # 2*Wh-1 * 2*Ww-1, nH
639
+ # cls to token, token to cls, and cls to cls
640
+
641
+ # get pair-wise relative position index for each token inside the window
642
+ coords_h = torch.arange(window_size[0])
643
+ coords_w = torch.arange(window_size[1])
644
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
645
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
646
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
647
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
648
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
649
+ relative_coords[:, :, 1] += window_size[1] - 1
650
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
651
+ relative_position_index = torch.zeros(
652
+ size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
653
+ )
654
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
655
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
656
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
657
+ relative_position_index[0, 0] = self.num_relative_distance - 1
658
+
659
+ self.register_buffer("relative_position_index", relative_position_index, persistent=False)
660
+
661
+ def forward(self):
662
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
663
+ self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1
664
+ ) # Wh*Ww,Wh*Ww,nH
665
+
666
+ return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
667
+
668
+
669
+ class BeitRelativePositionBias3D(nn.Module):
670
+ """
671
+ 3D relative position bias over a (time, height, width) token window, with additional learnable entries for cls tokens.
672
+ """
673
+ def __init__(self, config, window_size, num_cls_tokens=1):
674
+ super().__init__()
675
+ self.window_size = window_size
676
+ self.num_cls_tokens = num_cls_tokens
677
+
678
+ relative_size = [w * 2 - 1 for w in window_size]
679
+ self.num_relative_distance = np.prod(relative_size) + 2 * num_cls_tokens + num_cls_tokens ** 2
680
+
681
+ self.relative_position_bias_table = nn.Parameter(
682
+ torch.zeros(self.num_relative_distance, config.num_attention_heads)
683
+ )
684
+
685
+ # get pair-wise relative position index for each token inside the window
686
+ coords_range = [torch.arange(w) for w in window_size]
687
+ coords_flatten = torch.stack(torch.meshgrid(coords_range)).flatten(1)
688
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
689
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
690
+
691
+ for i, w in enumerate(window_size):
692
+ relative_coords[:, :, i] += w - 1 # shift to start from 0
693
+
694
+ for i, r in enumerate(relative_size[1:]):
695
+ relative_coords[:, :, :i + 1] *= r
696
+
697
+ self.seq_len = np.prod(window_size) + num_cls_tokens
698
+ relative_position_index = torch.zeros((self.seq_len, self.seq_len), dtype=relative_coords.dtype)
699
+ relative_position_index[num_cls_tokens:, num_cls_tokens:] = relative_coords.sum(-1)
700
+
701
+ start = np.prod(relative_size)
702
+ cls2loc = torch.arange(num_cls_tokens).unsqueeze(1) + start
703
+ relative_position_index[:num_cls_tokens, num_cls_tokens:] = cls2loc
704
+ start += num_cls_tokens
705
+
706
+ loc2cls = torch.arange(num_cls_tokens).unsqueeze(0) + start
707
+ relative_position_index[num_cls_tokens:, :num_cls_tokens] = loc2cls
708
+ start += num_cls_tokens
709
+
710
+ cls2cls = torch.arange(num_cls_tokens ** 2).view(num_cls_tokens, num_cls_tokens) + start
711
+ relative_position_index[:num_cls_tokens, :num_cls_tokens] = cls2cls
712
+
713
+ self.register_buffer("relative_position_index", relative_position_index)
714
+
715
+ def forward(self, from_idx=None, to_idx=None):
716
+ """
717
+ from_idx: indices of query tokens (1-dim)
718
+ to_idx: indices of key/value tokens (1-dim, or 2-dim w/ one row per query)
719
+ """
720
+ attn_idx = self.relative_position_index
721
+
722
+ # query indices
723
+ if from_idx is not None:
724
+ attn_idx = attn_idx[from_idx]
725
+
726
+ # key indices
727
+ if to_idx is not None:
728
+ assert to_idx.ndim in (1, 2), "to_idx must be a 1- or 2-dimensional tensor"
729
+ if to_idx.ndim == 1:
730
+ attn_idx = attn_idx[:, to_idx]
731
+ else:
732
+ attn_idx = attn_idx.gather(1, to_idx)
733
+
734
+ rows, cols = attn_idx.shape
735
+ relative_position_bias = self.relative_position_bias_table[attn_idx.flatten()]
736
+ relative_position_bias = rearrange(relative_position_bias, '(i j) h -> h i j', i=rows, j=cols)
737
+ return relative_position_bias.contiguous()
738
+
739
+
740
+ class BeitEncoder(nn.Module):
741
+ def __init__(self, config, window_size=None):
742
+ super().__init__()
743
+ self.config = config
744
+ if config.use_shared_relative_position_bias:
745
+ self.relative_position_bias = BeitRelativePositionBias3D(config, window_size=window_size)
746
+ else:
747
+ self.relative_position_bias = None
748
+
749
+ self._register_token_order(window_size)
750
+
751
+ # stochastic depth decay rule
752
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
753
+
754
+ # node sparsification
755
+ token_keep_rate = [1] * config.num_hidden_layers
756
+ for loc in config.token_drop_loc:
757
+ token_keep_rate[loc] = config.token_keep_rate
758
+
759
+ self.layer = nn.ModuleList(
760
+ [
761
+ BeitLayer(
762
+ config,
763
+ window_size=window_size if config.use_relative_position_bias else None,
764
+ drop_path_rate=dpr[i], token_keep_rate=token_keep_rate[i]
765
+ )
766
+ for i in range(config.num_hidden_layers)
767
+ ]
768
+ )
769
+
770
+ self.gradient_checkpointing = False
771
+
772
+ def _register_token_order(self, shape):
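+ # Precompute a space-filling-curve ordering (z-curve or Hilbert) of the 3D patch grid,
+ # so that consecutive tokens, and hence attention blocks, are spatio-temporally local.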
773
+ if self.config.token_3d_order == 'none':
774
+ order = None
775
+ elif self.config.token_3d_order == 'zcurve':
776
+ nbits = max(shape).bit_length()
777
+ coords = list(np.ndindex(*shape))
778
+ order = zCurve.par_interlace(coords, len(shape), nbits)
779
+ order = torch.tensor(np.argsort(order))
780
+ elif self.config.token_3d_order == 'hilbert':
781
+ nbits = max(shape).bit_length()
782
+ coords = list(np.ndindex(*shape))
783
+ order = hilbert.encode(np.stack(coords), len(shape), nbits)
784
+ order = torch.tensor(np.argsort(order))
785
+ else:
786
+ raise NotImplementedError(f"Token ordering {self.config.token_3d_order} not supported")
787
+
788
+ if order is not None:
789
+ self.register_buffer('token_order', order, persistent=False)
790
+ else:
791
+ self.token_order = None
792
+
793
+ def forward(
794
+ self,
795
+ hidden_states,
796
+ head_mask=None,
797
+ output_attentions=False,
798
+ output_hidden_states=False,
799
+ output_token_idx=False,
800
+ return_dict=True,
801
+ ):
802
+ all_hidden_states = () if output_hidden_states else None
803
+ all_self_attentions = () if output_attentions else None
804
+ all_token_idx = () if output_token_idx else None
805
+
806
+ token_idx = self.token_order
807
+ if token_idx is not None:
808
+ cls_states, local_states = hidden_states[:, :self.config.num_cls_tokens], hidden_states[:, self.config.num_cls_tokens:]
809
+ local_states = torch.index_select(local_states, dim=1, index=token_idx)
810
+ hidden_states = torch.cat((cls_states, local_states), 1)
811
+
812
+ for i, layer_module in enumerate(self.layer):
813
+ if output_hidden_states:
814
+ all_hidden_states = all_hidden_states + (hidden_states,)
815
+
816
+ layer_head_mask = head_mask[i] if head_mask is not None else None
817
+
818
+ if self.gradient_checkpointing and self.training:
819
+
820
+ def create_custom_forward(module):
821
+ def custom_forward(*inputs):
822
+ return module(*inputs, output_attentions)
823
+
824
+ return custom_forward
825
+
826
+ layer_outputs = torch.utils.checkpoint.checkpoint(
827
+ create_custom_forward(layer_module),
828
+ hidden_states,
829
+ layer_head_mask,
830
+ )
831
+ else:
832
+ relative_position_bias = (
833
+ self.relative_position_bias() if self.relative_position_bias is not None else None
834
+ )
835
+ layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias, token_idx)
836
+
837
+ hidden_states = layer_outputs[0]
838
+
839
+ if layer_module.token_keep_rate < 1:
840
+ token_idx = layer_outputs[-1]
841
+
842
+ if output_token_idx:
843
+ all_token_idx = all_token_idx + (token_idx,)
844
+
845
+ if output_attentions:
846
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
847
+
848
+ if output_hidden_states:
849
+ all_hidden_states = all_hidden_states + (hidden_states,)
850
+
851
+ if not return_dict:
852
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
853
+ return BeitModelOutput(
854
+ last_hidden_state=hidden_states,
855
+ hidden_states=all_hidden_states,
856
+ attentions=all_self_attentions,
857
+ token_idx=all_token_idx
858
+ )
859
+
860
+
861
+ class BeitPreTrainedModel(PreTrainedModel):
862
+ """
863
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
864
+ models.
865
+ """
866
+
867
+ config_class = BeitConfig
868
+ base_model_prefix = "beit"
869
+ supports_gradient_checkpointing = True
870
+
871
+ def _init_weights(self, module):
872
+ """Initialize the weights"""
873
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
874
+ # Slightly different from the TF version which uses truncated_normal for initialization
875
+ # cf https://github.com/pytorch/pytorch/pull/5617
876
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
877
+ if module.bias is not None:
878
+ module.bias.data.zero_()
879
+ elif isinstance(module, nn.Embedding):
880
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
881
+ if module.padding_idx is not None:
882
+ module.weight.data[module.padding_idx].zero_()
883
+ elif isinstance(module, nn.LayerNorm):
884
+ module.bias.data.zero_()
885
+ module.weight.data.fill_(1.0)
886
+
887
+ def _set_gradient_checkpointing(self, module, value=False):
888
+ if isinstance(module, BeitEncoder):
889
+ module.gradient_checkpointing = value
890
+
891
+
892
+ BEIT_START_DOCSTRING = r"""
893
+ This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ subclass. Use
894
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
895
+ behavior.
896
+
897
+ Parameters:
898
+ config (:class:`~transformers.BeitConfig`): Model configuration class with all the parameters of the model.
899
+ Initializing with a config file does not load the weights associated with the model, only the
900
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
901
+ weights.
902
+ """
903
+
904
+ BEIT_INPUTS_DOCSTRING = r"""
905
+ Args:
906
+ pixel_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_channels, height, width)`):
907
+ Pixel values. Pixel values can be obtained using :class:`~transformers.BeitFeatureExtractor`. See
908
+ :meth:`transformers.BeitFeatureExtractor.__call__` for details.
909
+
910
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
911
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
912
+
913
+ - 1 indicates the head is **not masked**,
914
+ - 0 indicates the head is **masked**.
915
+
916
+ output_attentions (:obj:`bool`, `optional`):
917
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
918
+ tensors for more detail.
919
+ output_hidden_states (:obj:`bool`, `optional`):
920
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
921
+ more detail.
922
+ return_dict (:obj:`bool`, `optional`):
923
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
924
+ """
925
+
926
+
927
+ @add_start_docstrings(
928
+ "The bare Beit Model transformer outputting raw hidden-states without any specific head on top.",
929
+ BEIT_START_DOCSTRING,
930
+ )
931
+ class BeitModel(BeitPreTrainedModel):
932
+ def __init__(self, config, add_pooling_layer=True, num_frames=None):
933
+ super().__init__(config)
934
+ self.config = config
935
+
936
+ self.embeddings = BeitEmbeddings(config)
937
+ self.window_size = self.embeddings.patch_embeddings.patch_shape
938
+ if num_frames is not None:
939
+ self.window_size = (num_frames,) + self.window_size
940
+ self.encoder = BeitEncoder(config, window_size=self.window_size)
941
+
942
+ self.layernorm = (
943
+ nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
944
+ )
945
+ self.pooler = BeitPooler(config) if add_pooling_layer else None
946
+
947
+ # Initialize weights and apply final processing
948
+ self.post_init()
949
+
950
+ def get_input_embeddings(self):
951
+ return self.embeddings.patch_embeddings
952
+
953
+ def _prune_heads(self, heads_to_prune):
954
+ """
955
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
956
+ class PreTrainedModel
957
+ """
958
+ for layer, heads in heads_to_prune.items():
959
+ self.encoder.layer[layer].attention.prune_heads(heads)
960
+
961
+ @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
962
+ @replace_return_docstrings(output_type=BeitModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
963
+ def forward(
964
+ self,
965
+ pixel_values=None,
966
+ bool_masked_pos=None,
967
+ head_mask=None,
968
+ output_attentions=None,
969
+ output_hidden_states=None,
970
+ output_token_idx=None,
971
+ return_dict=None,
972
+ ):
973
+ r"""
974
+ Returns:
975
+
976
+ Examples::
977
+
978
+ >>> from transformers import BeitFeatureExtractor, BeitModel
979
+ >>> from PIL import Image
980
+ >>> import requests
981
+
982
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
983
+ >>> image = Image.open(requests.get(url, stream=True).raw)
984
+
985
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
986
+ >>> model = BeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
987
+
988
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
989
+ >>> outputs = model(**inputs)
990
+ >>> last_hidden_states = outputs.last_hidden_state
991
+ """
992
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
993
+ output_hidden_states = (
994
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
995
+ )
996
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
997
+
998
+ if pixel_values is None:
999
+ raise ValueError("You have to specify pixel_values")
1000
+
1001
+ # Prepare head mask if needed
1002
+ # 1.0 in head_mask indicate we keep the head
1003
+ # attention_probs has shape bsz x n_heads x N x N
1004
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1005
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1006
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
1007
+
1008
+ embedding_output = self.embeddings(pixel_values, bool_masked_pos)
1009
+
1010
+ encoder_outputs = self.encoder(
1011
+ embedding_output,
1012
+ head_mask=head_mask,
1013
+ output_attentions=output_attentions,
1014
+ output_hidden_states=output_hidden_states,
1015
+ output_token_idx=output_token_idx,
1016
+ return_dict=return_dict,
1017
+ )
1018
+ sequence_output = encoder_outputs[0]
1019
+ sequence_output = self.layernorm(sequence_output)
1020
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1021
+
1022
+ if not return_dict:
1023
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1024
+
1025
+ return BeitModelOutputWithPooling(
1026
+ last_hidden_state=sequence_output,
1027
+ pooler_output=pooled_output,
1028
+ hidden_states=encoder_outputs.hidden_states,
1029
+ attentions=encoder_outputs.attentions,
1030
+ token_idx=encoder_outputs.token_idx,
1031
+ )
1032
+
1033
+
1034
+ class BeitPooler(nn.Module):
1035
+ def __init__(self, config):
1036
+ super().__init__()
1037
+ self.layernorm = (
1038
+ nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
1039
+ )
1040
+
1041
+ def forward(self, hidden_states):
1042
+ if self.layernorm is not None:
1043
+ # Mean pool the final hidden states of the patch tokens
1044
+ patch_tokens = hidden_states[:, 1:, :]
1045
+ pooled_output = self.layernorm(patch_tokens.mean(1))
1046
+ else:
1047
+ # Pool by simply taking the final hidden state of the [CLS] token
1048
+ pooled_output = hidden_states[:, 0]
1049
+
1050
+ return pooled_output
1051
+
1052
+
1053
+ @add_start_docstrings(
1054
+ "Beit Model transformer with a 'language' modeling head on top (to predict visual tokens).",
1055
+ BEIT_START_DOCSTRING,
1056
+ )
1057
+ class BeitForMaskedImageModeling(BeitPreTrainedModel):
1058
+ def __init__(self, config):
1059
+ super().__init__(config)
1060
+
1061
+ self.num_labels = config.num_labels
1062
+ self.beit = BeitModel(config, add_pooling_layer=False)
1063
+
1064
+ # Classifier head
1065
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1066
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
1067
+
1068
+ # Initialize weights and apply final processing
1069
+ self.post_init()
1070
+
1071
+ @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
1072
+ @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
1073
+ def forward(
1074
+ self,
1075
+ pixel_values=None,
1076
+ bool_masked_pos=None,
1077
+ head_mask=None,
1078
+ labels=None,
1079
+ output_attentions=None,
1080
+ output_hidden_states=None,
1081
+ return_dict=None,
1082
+ ):
1083
+ r"""
1084
+ bool_masked_pos (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, num_patches)`):
1085
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
1086
+
1087
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1088
+ Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
1089
+ config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1090
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1091
+
1092
+ Returns:
1093
+
1094
+ Examples::
1095
+
1096
+ >>> from transformers import BeitFeatureExtractor, BeitForMaskedImageModeling
1097
+ >>> from PIL import Image
1098
+ >>> import requests
1099
+
1100
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
1101
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1102
+
1103
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
1104
+ >>> model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
1105
+
1106
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
1107
+ >>> outputs = model(**inputs)
1108
+ >>> logits = outputs.logits
1109
+ """
1110
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1111
+
1112
+ outputs = self.beit(
1113
+ pixel_values,
1114
+ bool_masked_pos=bool_masked_pos,
1115
+ head_mask=head_mask,
1116
+ output_attentions=output_attentions,
1117
+ output_hidden_states=output_hidden_states,
1118
+ return_dict=return_dict,
1119
+ )
1120
+
1121
+ sequence_output = outputs[0]
1122
+ sequence_output = self.layernorm(sequence_output)
1123
+ prediction_scores = self.lm_head(sequence_output[:, 1:])
1124
+
1125
+ masked_lm_loss = None
1126
+ if labels is not None:
1127
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
1128
+ masked_lm_loss = loss_fct(prediction_scores[bool_masked_pos], labels)
1129
+
1130
+ if not return_dict:
1131
+ output = (prediction_scores,) + outputs[2:]
1132
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1133
+
1134
+ return MaskedLMOutput(
1135
+ loss=masked_lm_loss,
1136
+ logits=prediction_scores,
1137
+ hidden_states=outputs.hidden_states,
1138
+ attentions=outputs.attentions,
1139
+ )
1140
+
1141
+
1142
+ @add_start_docstrings(
1143
+ """
1144
+ Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
1145
+ hidden states of the patch tokens) e.g. for ImageNet.
1146
+ """,
1147
+ BEIT_START_DOCSTRING,
1148
+ )
1149
+ class BeitForImageClassification(BeitPreTrainedModel):
1150
+ def __init__(self, config):
1151
+ super().__init__(config)
1152
+
1153
+ self.num_labels = config.num_labels
1154
+ self.beit = BeitModel(config, add_pooling_layer=True)
1155
+
1156
+ # Classifier head
1157
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
1158
+
1159
+ # Initialize weights and apply final processing
1160
+ self.post_init()
1161
+
1162
+ @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
1163
+ @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
1164
+ def forward(
1165
+ self,
1166
+ pixel_values=None,
1167
+ head_mask=None,
1168
+ labels=None,
1169
+ output_attentions=None,
1170
+ output_hidden_states=None,
1171
+ return_dict=None,
1172
+ ):
1173
+ r"""
1174
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1175
+ Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
1176
+ config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1177
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1178
+
1179
+ Returns:
1180
+
1181
+ Examples::
1182
+
1183
+ >>> from transformers import BeitFeatureExtractor, BeitForImageClassification
1184
+ >>> from PIL import Image
1185
+ >>> import requests
1186
+
1187
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
1188
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1189
+
1190
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
1191
+ >>> model = BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
1192
+
1193
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
1194
+ >>> outputs = model(**inputs)
1195
+ >>> logits = outputs.logits
1196
+ >>> # model predicts one of the 1000 ImageNet classes
1197
+ >>> predicted_class_idx = logits.argmax(-1).item()
1198
+ >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
1199
+ """
1200
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1201
+
1202
+ outputs = self.beit(
1203
+ pixel_values,
1204
+ head_mask=head_mask,
1205
+ output_attentions=output_attentions,
1206
+ output_hidden_states=output_hidden_states,
1207
+ return_dict=return_dict,
1208
+ )
1209
+
1210
+ pooled_output = outputs.pooler_output if return_dict else outputs[1]
1211
+
1212
+ logits = self.classifier(pooled_output)
1213
+
1214
+ loss = None
1215
+ if labels is not None:
1216
+ if self.num_labels == 1:
1217
+ # We are doing regression
1218
+ loss_fct = MSELoss()
1219
+ loss = loss_fct(logits.view(-1), labels.view(-1))
1220
+ else:
1221
+ loss_fct = CrossEntropyLoss()
1222
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1223
+
1224
+ if not return_dict:
1225
+ output = (logits,) + outputs[2:]
1226
+ return ((loss,) + output) if loss is not None else output
1227
+
1228
+ return SequenceClassifierOutput(
1229
+ loss=loss,
1230
+ logits=logits,
1231
+ hidden_states=outputs.hidden_states,
1232
+ attentions=outputs.attentions,
1233
+ )
1234
+
1235
+
1236
+ class BeitConvModule(nn.Module):
1237
+ """
1238
+ A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
1239
+ layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
1240
+
1241
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
1242
+ """
1243
+
1244
+ def __init__(self, in_channels, out_channels, kernel_size, padding=0, bias=False, dilation=1):
1245
+ super().__init__()
1246
+ self.conv = nn.Conv2d(
1247
+ in_channels=in_channels,
1248
+ out_channels=out_channels,
1249
+ kernel_size=kernel_size,
1250
+ padding=padding,
1251
+ bias=bias,
1252
+ dilation=dilation,
1253
+ )
1254
+ self.bn = nn.BatchNorm2d(out_channels)
1255
+ self.activation = nn.ReLU()
1256
+
1257
+ def forward(self, input):
1258
+ output = self.conv(input)
1259
+ output = self.bn(output)
1260
+ output = self.activation(output)
1261
+
1262
+ return output
1263
+
1264
+
1265
+ class BeitPyramidPoolingModule(nn.ModuleList):
1266
+ """
1267
+ Pyramid Pooling Module (PPM) used in PSPNet.
1268
+
1269
+ Args:
1270
+ pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
1271
+ Module.
1272
+ in_channels (int): Input channels.
1273
+ channels (int): Channels after modules, before conv_seg.
1274
+ align_corners (bool): align_corners argument of F.interpolate.
1275
+
1276
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
1277
+ """
1278
+
1279
+ def __init__(self, pool_scales, in_channels, channels, align_corners):
1280
+ super().__init__()
1281
+ self.pool_scales = pool_scales
1282
+ self.align_corners = align_corners
1283
+ self.in_channels = in_channels
1284
+ self.channels = channels
1285
+ for pool_scale in pool_scales:
1286
+ self.append(
1287
+ nn.Sequential(
1288
+ nn.AdaptiveAvgPool2d(pool_scale),
1289
+ BeitConvModule(self.in_channels, self.channels, kernel_size=1),
1290
+ )
1291
+ )
1292
+
1293
+ def forward(self, x):
1294
+ ppm_outs = []
1295
+ for ppm in self:
1296
+ ppm_out = ppm(x)
1297
+ upsampled_ppm_out = nn.functional.interpolate(
1298
+ ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
1299
+ )
1300
+ ppm_outs.append(upsampled_ppm_out)
1301
+ return ppm_outs
1302
+
1303
+
1304
+ class BeitUperHead(nn.Module):
1305
+ """
1306
+ Unified Perceptual Parsing for Scene Understanding. This head is the implementation of `UPerNet
1307
+ <https://arxiv.org/abs/1807.10221>`_.
1308
+
1309
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
1310
+ """
1311
+
1312
+ def __init__(self, config):
1313
+ super().__init__()
1314
+
1315
+ self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6)
1316
+ self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768]
1317
+ self.channels = config.hidden_size
1318
+ self.align_corners = False
1319
+ self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
1320
+
1321
+ # PSP Module
1322
+ self.psp_modules = BeitPyramidPoolingModule(
1323
+ self.pool_scales,
1324
+ self.in_channels[-1],
1325
+ self.channels,
1326
+ align_corners=self.align_corners,
1327
+ )
1328
+ self.bottleneck = BeitConvModule(
1329
+ self.in_channels[-1] + len(self.pool_scales) * self.channels,
1330
+ self.channels,
1331
+ kernel_size=3,
1332
+ padding=1,
1333
+ )
1334
+ # FPN Module
1335
+ self.lateral_convs = nn.ModuleList()
1336
+ self.fpn_convs = nn.ModuleList()
1337
+ for in_channels in self.in_channels[:-1]: # skip the top layer
1338
+ l_conv = BeitConvModule(in_channels, self.channels, kernel_size=1)
1339
+ fpn_conv = BeitConvModule(self.channels, self.channels, kernel_size=3, padding=1)
1340
+ self.lateral_convs.append(l_conv)
1341
+ self.fpn_convs.append(fpn_conv)
1342
+
1343
+ self.fpn_bottleneck = BeitConvModule(
1344
+ len(self.in_channels) * self.channels,
1345
+ self.channels,
1346
+ kernel_size=3,
1347
+ padding=1,
1348
+ )
1349
+
1350
+ def psp_forward(self, inputs):
1351
+ x = inputs[-1]
1352
+ psp_outs = [x]
1353
+ psp_outs.extend(self.psp_modules(x))
1354
+ psp_outs = torch.cat(psp_outs, dim=1)
1355
+ output = self.bottleneck(psp_outs)
1356
+
1357
+ return output
1358
+
1359
+ def forward(self, encoder_hidden_states):
1360
+ # build laterals
1361
+ laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]
1362
+
1363
+ laterals.append(self.psp_forward(encoder_hidden_states))
1364
+
1365
+ # build top-down path
1366
+ used_backbone_levels = len(laterals)
1367
+ for i in range(used_backbone_levels - 1, 0, -1):
1368
+ prev_shape = laterals[i - 1].shape[2:]
1369
+ laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
1370
+ laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
1371
+ )
1372
+
1373
+ # build outputs
1374
+ fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
1375
+ # append psp feature
1376
+ fpn_outs.append(laterals[-1])
1377
+
1378
+ for i in range(used_backbone_levels - 1, 0, -1):
1379
+ fpn_outs[i] = nn.functional.interpolate(
1380
+ fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
1381
+ )
1382
+ fpn_outs = torch.cat(fpn_outs, dim=1)
1383
+ output = self.fpn_bottleneck(fpn_outs)
1384
+ output = self.classifier(output)
1385
+
1386
+ return output
1387
+
1388
+
1389
+ class BeitFCNHead(nn.Module):
1390
+ """
1391
+ Fully Convolutional Networks for Semantic Segmentation. This head is the implementation of `FCNNet
1392
+ <https://arxiv.org/abs/1411.4038>`_.
1393
+
1394
+ Args:
1395
+ config (BeitConfig): Configuration.
1396
+ in_index (int): Index of the input feature map (from the encoder outputs) to use. Default: 2.
1397
+ kernel_size (int): The kernel size for convs in the head. Default: 3.
1398
+ dilation (int): The dilation rate for convs in the head. Default: 1.
1399
+
1400
+
1401
+ Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
1402
+ """
1403
+
1404
+ def __init__(self, config, in_index=2, kernel_size=3, dilation=1):
1405
+ super().__init__()
1406
+ self.in_channels = config.hidden_size
1407
+ self.channels = config.auxiliary_channels
1408
+ self.num_convs = config.auxiliary_num_convs
1409
+ self.concat_input = config.auxiliary_concat_input
1410
+ self.in_index = in_index
1411
+
1412
+ conv_padding = (kernel_size // 2) * dilation
1413
+ convs = []
1414
+ convs.append(
1415
+ BeitConvModule(
1416
+ self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
1417
+ )
1418
+ )
1419
+ for i in range(self.num_convs - 1):
1420
+ convs.append(
1421
+ BeitConvModule(
1422
+ self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
1423
+ )
1424
+ )
1425
+ if self.num_convs == 0:
1426
+ self.convs = nn.Identity()
1427
+ else:
1428
+ self.convs = nn.Sequential(*convs)
1429
+ if self.concat_input:
1430
+ self.conv_cat = BeitConvModule(
1431
+ self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
1432
+ )
1433
+
1434
+ self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)
1435
+
1436
+ def forward(self, encoder_hidden_states):
1437
+ # just take the relevant feature maps
1438
+ hidden_states = encoder_hidden_states[self.in_index]
1439
+ output = self.convs(hidden_states)
1440
+ if self.concat_input:
1441
+ output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
1442
+ output = self.classifier(output)
1443
+ return output
1444
+
1445
+
1446
+ @add_start_docstrings(
1447
+ """
1448
+ Beit Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
1449
+ """,
1450
+ BEIT_START_DOCSTRING,
1451
+ )
1452
+ class BeitForSemanticSegmentation(BeitPreTrainedModel):
1453
+ def __init__(self, config):
1454
+ super().__init__(config)
1455
+
1456
+ self.num_labels = config.num_labels
1457
+ self.beit = BeitModel(config, add_pooling_layer=False)
1458
+
1459
+ # FPNs
1460
+ self.fpn1 = nn.Sequential(
1461
+ nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
1462
+ nn.BatchNorm2d(config.hidden_size),
1463
+ nn.GELU(),
1464
+ nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
1465
+ )
1466
+ self.fpn2 = nn.Sequential(
1467
+ nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
1468
+ )
1469
+ self.fpn3 = nn.Identity()
1470
+ self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
1471
+
1472
+ # Semantic segmentation head(s)
1473
+ self.decode_head = BeitUperHead(config)
1474
+ self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None
1475
+
1476
+ # Initialize weights and apply final processing
1477
+ self.post_init()
1478
+
1479
+ def compute_loss(self, logits, auxiliary_logits, labels):
1480
+ # upsample logits to the images' original size
1481
+ upsampled_logits = nn.functional.interpolate(
1482
+ logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
1483
+ )
1484
+ if auxiliary_logits is not None:
1485
+ upsampled_auxiliary_logits = nn.functional.interpolate(
1486
+ auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
1487
+ )
1488
+ # compute weighted loss
1489
+ loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
1490
+ main_loss = loss_fct(upsampled_logits, labels)
1491
+ auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
1492
+ loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss
1493
+
1494
+ return loss
1495
+
1496
+ @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
1497
+ @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
1498
+ def forward(
1499
+ self,
1500
+ pixel_values=None,
1501
+ head_mask=None,
1502
+ labels=None,
1503
+ output_attentions=None,
1504
+ output_hidden_states=None,
1505
+ return_dict=None,
1506
+ ):
1507
+ r"""
1508
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, height, width)`, `optional`):
1509
+ Ground truth semantic segmentation maps for computing the loss. Indices should be in :obj:`[0, ...,
1510
+ config.num_labels - 1]`. If :obj:`config.num_labels > 1`, a classification loss is computed
1511
+ (Cross-Entropy).
1512
+
1513
+ Returns:
1514
+
1515
+ Examples::
1516
+
1517
+ >>> from transformers import BeitFeatureExtractor, BeitForSemanticSegmentation
1518
+ >>> from PIL import Image
1519
+ >>> import requests
1520
+
1521
+ >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
1522
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1523
+
1524
+ >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
1525
+ >>> model = BeitForSemanticSegmentation.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
1526
+
1527
+ >>> inputs = feature_extractor(images=image, return_tensors="pt")
1528
+ >>> outputs = model(**inputs)
1529
+ >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
1530
+ >>> logits = outputs.logits
1531
+ """
1532
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1533
+ output_hidden_states = (
1534
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1535
+ )
1536
+
1537
+ outputs = self.beit(
1538
+ pixel_values,
1539
+ head_mask=head_mask,
1540
+ output_attentions=output_attentions,
1541
+ output_hidden_states=True, # we need the intermediate hidden states
1542
+ return_dict=return_dict,
1543
+ )
1544
+
1545
+ encoder_hidden_states = outputs.hidden_states if return_dict else outputs[2]
1546
+
1547
+ # only keep certain features, and reshape
1548
+ # note that we do +1 as the encoder_hidden_states also includes the initial embeddings
1549
+ features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
1550
+ batch_size = pixel_values.shape[0]
1551
+ patch_resolution = self.config.image_size // self.config.patch_size
1552
+ features = [
1553
+ x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features
1554
+ ]
1555
+
1556
+ # apply FPNs
1557
+ ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
1558
+ for i in range(len(features)):
1559
+ features[i] = ops[i](features[i])
1560
+
1561
+ logits = self.decode_head(features)
1562
+ auxiliary_logits = None
1563
+ if self.auxiliary_head is not None:
1564
+ auxiliary_logits = self.auxiliary_head(features)
1565
+
1566
+ loss = None
1567
+ if labels is not None:
1568
+ if self.config.num_labels == 1:
1569
+ raise ValueError("The number of labels should be greater than one")
1570
+ else:
1571
+ loss = self.compute_loss(logits, auxiliary_logits, labels)
1572
+
1573
+ if not return_dict:
1574
+ if output_hidden_states:
1575
+ output = (logits,) + outputs[2:]
1576
+ else:
1577
+ output = (logits,) + outputs[3:]
1578
+ return ((loss,) + output) if loss is not None else output
1579
+
1580
+ return SequenceClassifierOutput(
1581
+ loss=loss,
1582
+ logits=logits,
1583
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
1584
+ attentions=outputs.attentions,
1585
+ )
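
A minimal, self-contained sketch of how the local + random block pattern used by sparse_attention() above is assembled. The sizes below (block_size, num_tokens, local_attn, random_attn) are illustrative assumptions, not values taken from this repo's configs; only the roll/top-k bookkeeping mirrors the code.

    import torch

    block_size = 4                          # stands in for config.attn_block_size
    num_tokens = 32                         # patch tokens after blockify() padding (illustrative)
    num_blocks = num_tokens // block_size
    local_attn = 3                          # stands in for config.sparse_local_attn (odd window, counted in blocks)
    random_attn = 2                         # stands in for config.sparse_random_attn (random key blocks per query block)

    attn_idx = torch.zeros((num_blocks, local_attn + random_attn), dtype=torch.int64)

    # local window: each query block attends to itself and its neighbours (with wrap-around)
    cnt = 0
    num_rolls = local_attn // 2
    for r in range(-num_rolls, num_rolls + 1):
        attn_idx[:, cnt] = torch.arange(num_blocks).roll(r)
        cnt += 1

    # random attention: sample extra key blocks per query block, first zeroing out the
    # local window so the random picks do not overlap with it
    rand = torch.rand(num_blocks, num_blocks)
    for r in range(-num_rolls, num_rolls + 1):
        tgt_idx = [(i + r) % num_blocks for i in range(num_blocks)]
        rand[range(num_blocks), tgt_idx] = 0
    _, idx = rand.topk(random_attn, dim=-1)
    attn_idx[:, cnt:] = torch.sort(idx, dim=1).values

    # row m lists the key blocks visible to query block m; cls tokens are handled
    # separately with dense attention, as in the code above
    print(attn_idx)

Each query block therefore attends to a fixed number of key blocks (local_attn + random_attn) plus the cls tokens, so attention cost grows linearly with the number of tokens rather than quadratically.
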
svitt/sparse_xbert.py ADDED
@@ -0,0 +1,2039 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch BERT model. """
17
+
18
+ import math
19
+ import os
20
+ import warnings
21
+ from dataclasses import dataclass
22
+ from typing import Optional, Tuple
23
+
24
+ import torch
25
+ from torch import Tensor, device, nn
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import CrossEntropyLoss, MSELoss
29
+ import torch.nn.functional as F
30
+
31
+ from transformers.activations import ACT2FN
32
+ from transformers.file_utils import (
33
+ ModelOutput,
34
+ add_start_docstrings,
35
+ add_start_docstrings_to_model_forward,
36
+ replace_return_docstrings,
37
+ )
38
+ from transformers.modeling_outputs import (
39
+ BaseModelOutputWithPastAndCrossAttentions,
40
+ BaseModelOutputWithPoolingAndCrossAttentions,
41
+ CausalLMOutputWithCrossAttentions,
42
+ MaskedLMOutput,
43
+ MultipleChoiceModelOutput,
44
+ NextSentencePredictorOutput,
45
+ QuestionAnsweringModelOutput,
46
+ SequenceClassifierOutput,
47
+ TokenClassifierOutput,
48
+ )
49
+ from transformers.modeling_utils import (
50
+ PreTrainedModel,
51
+ apply_chunking_to_forward,
52
+ find_pruneable_heads_and_indices,
53
+ prune_linear_layer,
54
+ )
55
+ from svitt.sparse_config import BertConfig
56
+
57
+ import transformers
58
+ transformers.logging.set_verbosity_error()
59
+
60
+
61
+ _CONFIG_FOR_DOC = "BertConfig"
62
+ _TOKENIZER_FOR_DOC = "BertTokenizer"
63
+
64
+ BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
65
+ "bert-base-uncased",
66
+ "bert-large-uncased",
67
+ "bert-base-cased",
68
+ "bert-large-cased",
69
+ "bert-base-multilingual-uncased",
70
+ "bert-base-multilingual-cased",
71
+ "bert-base-chinese",
72
+ "bert-base-german-cased",
73
+ "bert-large-uncased-whole-word-masking",
74
+ "bert-large-cased-whole-word-masking",
75
+ "bert-large-uncased-whole-word-masking-finetuned-squad",
76
+ "bert-large-cased-whole-word-masking-finetuned-squad",
77
+ "bert-base-cased-finetuned-mrpc",
78
+ "bert-base-german-dbmdz-cased",
79
+ "bert-base-german-dbmdz-uncased",
80
+ "cl-tohoku/bert-base-japanese",
81
+ "cl-tohoku/bert-base-japanese-whole-word-masking",
82
+ "cl-tohoku/bert-base-japanese-char",
83
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking",
84
+ "TurkuNLP/bert-base-finnish-cased-v1",
85
+ "TurkuNLP/bert-base-finnish-uncased-v1",
86
+ "wietsedv/bert-base-dutch-cased",
87
+ # See all BERT models at https://huggingface.co/models?filter=bert
88
+ ]
89
+
90
+
91
+ @dataclass
92
+ class BertModelOutputWithPastAndCrossAttentions(BaseModelOutputWithPastAndCrossAttentions):
93
+ token_idx: Optional[Tuple[torch.LongTensor]] = None
94
+
95
+
96
+ @dataclass
97
+ class BertModelOutputWithPoolingAndCrossAttentions(BaseModelOutputWithPoolingAndCrossAttentions):
98
+ token_idx: Optional[Tuple[torch.LongTensor]] = None
99
+
100
+
101
+ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
102
+ """Load tf checkpoints in a pytorch model."""
103
+ try:
104
+ import re
105
+
106
+ import numpy as np
107
+ import tensorflow as tf
108
+ except ImportError:
109
+ print(
110
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
111
+ "https://www.tensorflow.org/install/ for installation instructions."
112
+ )
113
+ raise
114
+ tf_path = os.path.abspath(tf_checkpoint_path)
115
+ # Load weights from TF model
116
+ init_vars = tf.train.list_variables(tf_path)
117
+ names = []
118
+ arrays = []
119
+ for name, shape in init_vars:
120
+ array = tf.train.load_variable(tf_path, name)
121
+ names.append(name)
122
+ arrays.append(array)
123
+
124
+ for name, array in zip(names, arrays):
125
+ name = name.split("/")
126
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
127
+ # which are not required for using the pretrained model
128
+ if any(
129
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer",
130
+ "AdamWeightDecayOptimizer_1", "global_step"]
131
+ for n in name
132
+ ):
133
+ continue
134
+ pointer = model
135
+ for m_name in name:
136
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
137
+ scope_names = re.split(r"_(\d+)", m_name)
138
+ else:
139
+ scope_names = [m_name]
140
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
141
+ pointer = getattr(pointer, "weight")
142
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
143
+ pointer = getattr(pointer, "bias")
144
+ elif scope_names[0] == "output_weights":
145
+ pointer = getattr(pointer, "weight")
146
+ elif scope_names[0] == "squad":
147
+ pointer = getattr(pointer, "classifier")
148
+ else:
149
+ try:
150
+ pointer = getattr(pointer, scope_names[0])
151
+ except AttributeError:
152
+ continue
153
+ if len(scope_names) >= 2:
154
+ num = int(scope_names[1])
155
+ pointer = pointer[num]
156
+ if m_name[-11:] == "_embeddings":
157
+ pointer = getattr(pointer, "weight")
158
+ elif m_name == "kernel":
159
+ array = np.transpose(array)
160
+ try:
161
+ assert (
162
+ pointer.shape == array.shape
163
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
164
+ except AssertionError as e:
165
+ e.args += (pointer.shape, array.shape)
166
+ raise
167
+ pointer.data = torch.from_numpy(array)
168
+ return model
169
+
170
+
171
+ class BertEmbeddings(nn.Module):
172
+ """Construct the embeddings from word, position and token_type embeddings."""
173
+
174
+ def __init__(self, config):
175
+ super().__init__()
176
+ self.word_embeddings = nn.Embedding(
177
+ config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
178
+ self.position_embeddings = nn.Embedding(
179
+ config.max_position_embeddings, config.hidden_size)
180
+ self.token_type_embeddings = nn.Embedding(
181
+ config.type_vocab_size, config.hidden_size)
182
+
183
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
184
+ # any TensorFlow checkpoint file
185
+ self.LayerNorm = nn.LayerNorm(
186
+ config.hidden_size, eps=config.layer_norm_eps)
187
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
188
+
189
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
190
+ self.register_buffer("position_ids", torch.arange(
191
+ config.max_position_embeddings).expand((1, -1)))
192
+ self.position_embedding_type = getattr(
193
+ config, "position_embedding_type", "absolute")
194
+
195
+ self.config = config
196
+
197
+ def forward(
198
+ self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
199
+ ):
200
+ if input_ids is not None:
201
+ input_shape = input_ids.size()
202
+ else:
203
+ input_shape = inputs_embeds.size()[:-1]
204
+
205
+ seq_length = input_shape[1]
206
+
207
+ if position_ids is None:
208
+ position_ids = self.position_ids[:,
209
+ past_key_values_length: seq_length + past_key_values_length]
210
+
211
+ if token_type_ids is None:
212
+ token_type_ids = torch.zeros(
213
+ input_shape, dtype=torch.long, device=self.position_ids.device)
214
+
215
+ if inputs_embeds is None:
216
+ inputs_embeds = self.word_embeddings(input_ids)
217
+
218
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
219
+
220
+ embeddings = inputs_embeds + token_type_embeddings
221
+ if self.position_embedding_type == "absolute":
222
+ position_embeddings = self.position_embeddings(position_ids)
223
+ embeddings += position_embeddings
224
+ embeddings = self.LayerNorm(embeddings)
225
+ embeddings = self.dropout(embeddings)
226
+ return embeddings
227
+
228
+
229
+ class BertSelfAttention(nn.Module):
230
+ def __init__(self, config, is_cross_attention):
231
+ super().__init__()
232
+ self.config = config
233
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
234
+ raise ValueError(
235
+ "The hidden size (%d) is not a multiple of the number of attention "
236
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
237
+ )
238
+
239
+ self.num_attention_heads = config.num_attention_heads
240
+ self.attention_head_size = int(
241
+ config.hidden_size / config.num_attention_heads)
242
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
243
+
244
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
245
+ if is_cross_attention:
246
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
247
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
248
+ else:
249
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
250
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
251
+
252
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
253
+ self.position_embedding_type = getattr(
254
+ config, "position_embedding_type", "absolute")
255
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
256
+ self.max_position_embeddings = config.max_position_embeddings
257
+ self.distance_embedding = nn.Embedding(
258
+ 2 * config.max_position_embeddings - 1, self.attention_head_size)
259
+ self.save_attention = False
260
+
261
+ def save_attn_gradients(self, attn_gradients):
262
+ self.attn_gradients = attn_gradients
263
+
264
+ def get_attn_gradients(self):
265
+ return self.attn_gradients
266
+
267
+ def save_attention_map(self, attention_map):
268
+ self.attention_map = attention_map
269
+
270
+ def get_attention_map(self):
271
+ return self.attention_map
272
+
273
+ def transpose_for_scores(self, x):
274
+ new_x_shape = x.size()[
275
+ :-1] + (self.num_attention_heads, self.attention_head_size)
276
+ x = x.view(*new_x_shape)
277
+ return x.permute(0, 2, 1, 3)
278
+
279
+ def forward(
280
+ self,
281
+ hidden_states,
282
+ attention_mask=None,
283
+ head_mask=None,
284
+ encoder_hidden_states=None,
285
+ encoder_attention_mask=None,
286
+ past_key_value=None,
287
+ output_attentions=False,
288
+ ):
289
+ mixed_query_layer = self.query(hidden_states)
290
+
291
+ # If this is instantiated as a cross-attention module, the keys
292
+ # and values come from an encoder; the attention mask needs to be
293
+ # such that the encoder's padding tokens are not attended to.
294
+ is_cross_attention = encoder_hidden_states is not None
295
+
296
+ if is_cross_attention:
297
+ key_layer = self.transpose_for_scores(
298
+ self.key(encoder_hidden_states))
299
+ value_layer = self.transpose_for_scores(
300
+ self.value(encoder_hidden_states))
301
+ attention_mask = encoder_attention_mask
302
+ elif past_key_value is not None:
303
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
304
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
305
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
306
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
307
+ else:
308
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
309
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
310
+
311
+ query_layer = self.transpose_for_scores(mixed_query_layer)
312
+
313
+ past_key_value = (key_layer, value_layer)
314
+
315
+ # Take the dot product between "query" and "key" to get the raw attention scores.
316
+ attention_scores = torch.matmul(
317
+ query_layer, key_layer.transpose(-1, -2))
318
+
319
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
320
+ seq_length = hidden_states.size()[1]
321
+ position_ids_l = torch.arange(
322
+ seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
323
+ position_ids_r = torch.arange(
324
+ seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
325
+ distance = position_ids_l - position_ids_r
326
+ positional_embedding = self.distance_embedding(
327
+ distance + self.max_position_embeddings - 1)
328
+ positional_embedding = positional_embedding.to(
329
+ dtype=query_layer.dtype) # fp16 compatibility
330
+
331
+ if self.position_embedding_type == "relative_key":
332
+ relative_position_scores = torch.einsum(
333
+ "bhld,lrd->bhlr", query_layer, positional_embedding)
334
+ attention_scores = attention_scores + relative_position_scores
335
+ elif self.position_embedding_type == "relative_key_query":
336
+ relative_position_scores_query = torch.einsum(
337
+ "bhld,lrd->bhlr", query_layer, positional_embedding)
338
+ relative_position_scores_key = torch.einsum(
339
+ "bhrd,lrd->bhlr", key_layer, positional_embedding)
340
+ attention_scores = attention_scores + \
341
+ relative_position_scores_query + relative_position_scores_key
342
+
343
+ attention_scores = attention_scores / \
344
+ math.sqrt(self.attention_head_size)
345
+ if attention_mask is not None:
346
+ # Apply the attention mask (precomputed for all layers in BertModel's forward() function)
347
+ attention_scores = attention_scores + attention_mask
348
+
349
+ # Normalize the attention scores to probabilities.
350
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
351
+
352
+ if is_cross_attention and self.save_attention:
353
+ self.save_attention_map(attention_probs)
354
+ attention_probs.register_hook(self.save_attn_gradients)
355
+
356
+ # This is actually dropping out entire tokens to attend to, which might
357
+ # seem a bit unusual, but is taken from the original Transformer paper.
358
+ attention_probs_dropped = self.dropout(attention_probs)
359
+
360
+ # Mask heads if we want to
361
+ if head_mask is not None:
362
+ attention_probs_dropped = attention_probs_dropped * head_mask
363
+
364
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
365
+
366
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
367
+ new_context_layer_shape = context_layer.size()[
368
+ :-2] + (self.all_head_size,)
369
+ context_layer = context_layer.view(*new_context_layer_shape)
370
+
371
+ # added `attention_scores` to return tuple
372
+ outputs = (context_layer, attention_probs, attention_scores) if output_attentions else (
373
+ context_layer,)
374
+
375
+ outputs = outputs + (past_key_value,)
376
+ return outputs
377
+
378
+
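For reference, `transpose_for_scores` above just splits the hidden dimension into per-head slices before the attention matmuls; a minimal standalone sketch with hypothetical sizes (not tied to any particular checkpoint):

import torch

batch, seq_len, num_heads, head_dim = 2, 16, 12, 64
hidden = num_heads * head_dim                     # 768 for a bert-base-sized model
x = torch.randn(batch, seq_len, hidden)
# split the last dimension into heads, then move the head axis in front of the sequence axis
x = x.view(batch, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)
assert x.shape == (batch, num_heads, seq_len, head_dim)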
379
+ class BertSelfOutput(nn.Module):
380
+ def __init__(self, config):
381
+ super().__init__()
382
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
383
+ self.LayerNorm = nn.LayerNorm(
384
+ config.hidden_size, eps=config.layer_norm_eps)
385
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
386
+
387
+ def forward(self, hidden_states, input_tensor):
388
+ hidden_states = self.dense(hidden_states)
389
+ hidden_states = self.dropout(hidden_states)
390
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
391
+ return hidden_states
392
+
393
+
394
+ class BertAttention(nn.Module):
395
+ def __init__(self, config, is_cross_attention=False):
396
+ super().__init__()
397
+ self.self = BertSelfAttention(config, is_cross_attention)
398
+ self.output = BertSelfOutput(config)
399
+ self.pruned_heads = set()
400
+
401
+ def prune_heads(self, heads):
402
+ if len(heads) == 0:
403
+ return
404
+ heads, index = find_pruneable_heads_and_indices(
405
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
406
+ )
407
+
408
+ # Prune linear layers
409
+ self.self.query = prune_linear_layer(self.self.query, index)
410
+ self.self.key = prune_linear_layer(self.self.key, index)
411
+ self.self.value = prune_linear_layer(self.self.value, index)
412
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
413
+
414
+ # Update hyper params and store pruned heads
415
+ self.self.num_attention_heads = self.self.num_attention_heads - \
416
+ len(heads)
417
+ self.self.all_head_size = self.self.attention_head_size * \
418
+ self.self.num_attention_heads
419
+ self.pruned_heads = self.pruned_heads.union(heads)
420
+
421
+ def forward(
422
+ self,
423
+ hidden_states,
424
+ attention_mask=None,
425
+ head_mask=None,
426
+ encoder_hidden_states=None,
427
+ encoder_attention_mask=None,
428
+ past_key_value=None,
429
+ output_attentions=False,
430
+ ):
431
+ self_outputs = self.self(
432
+ hidden_states,
433
+ attention_mask,
434
+ head_mask,
435
+ encoder_hidden_states,
436
+ encoder_attention_mask,
437
+ past_key_value,
438
+ output_attentions,
439
+ )
440
+ attention_output = self.output(self_outputs[0], hidden_states)
441
+ # add attentions if we output them
442
+ outputs = (attention_output,) + self_outputs[1:]
443
+ return outputs # (context_layer, attention_probs, attention_scores, past_key_value,)
444
+
445
+
446
+ class BertIntermediate(nn.Module):
447
+ def __init__(self, config):
448
+ super().__init__()
449
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
450
+ if isinstance(config.hidden_act, str):
451
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
452
+ else:
453
+ self.intermediate_act_fn = config.hidden_act
454
+
455
+ def forward(self, hidden_states):
456
+ hidden_states = self.dense(hidden_states)
457
+ hidden_states = self.intermediate_act_fn(hidden_states)
458
+ return hidden_states
459
+
460
+
461
+ class BertOutput(nn.Module):
462
+ def __init__(self, config):
463
+ super().__init__()
464
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
465
+ self.LayerNorm = nn.LayerNorm(
466
+ config.hidden_size, eps=config.layer_norm_eps)
467
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
468
+
469
+ def forward(self, hidden_states, input_tensor):
470
+ hidden_states = self.dense(hidden_states)
471
+ hidden_states = self.dropout(hidden_states)
472
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
473
+ return hidden_states
474
+
475
+
476
+ class BertLayer(nn.Module):
477
+ def __init__(self, config, layer_num, token_keep_rate=1.0):
478
+ super().__init__()
479
+ self.config = config
480
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
481
+ self.seq_len_dim = 1
482
+ self.attention = BertAttention(config)
483
+
484
+ self.has_cross_attention = (layer_num >= config.fusion_layer)
485
+ if self.has_cross_attention:
486
+ self.layer_num = layer_num
487
+ self.crossattention = BertAttention(
488
+ config, is_cross_attention=True)
489
+
490
+ # sparse params
491
+ self.token_keep_rate = token_keep_rate
492
+ self.token_keep_strategy = config.token_keep_strategy
493
+ self.encoder_num_cls_tokens = 1 # multiple cls tokens
494
+
495
+ self.intermediate = BertIntermediate(config)
496
+ self.output = BertOutput(config)
497
+
498
+ def sparsify(self, x, attn, mask=None):
499
+ x_cls, x_ = x[:, :self.encoder_num_cls_tokens], x[:, self.encoder_num_cls_tokens:]
500
+ assert 0 < self.token_keep_rate <= 1, "Expected keep rate in range (0, 1]"
501
+ left_tokens = math.ceil(self.token_keep_rate * x_.size(1))
502
+ if len(attn.shape) == 4:
503
+ attn = attn.mean(1) # pool over attention heads
504
+
505
+ if self.token_keep_strategy == 'cls_attn':
506
+ cls_attn = attn[:, 0, self.encoder_num_cls_tokens:]
507
+ _, idx = torch.topk(cls_attn, left_tokens, dim=1) # [B, left_tokens]
508
+
509
+ elif self.token_keep_strategy == 'avg_attn':
510
+ avg_attn = attn.mean(1)[:, self.encoder_num_cls_tokens:]
511
+ _, idx = torch.topk(avg_attn, left_tokens, dim=1) # [B, left_tokens]
512
+
513
+ elif self.token_keep_strategy == 'random':
514
+ rand = torch.rand(x_.shape[:2], device=x_.device)
515
+ _, idx = torch.topk(rand, left_tokens, dim=1) # [B, left_tokens]
516
+
517
+ else:
518
+ raise NotImplementedError(f"Sparse strategy {self.token_keep_strategy} is not implemented")
519
+
520
+ idx, _ = torch.sort(idx, dim=1)
521
+ index = idx.unsqueeze(-1).expand(-1, -1, x_.size(-1)) # [B, left_tokens, C]
522
+ outputs = torch.cat((x_cls, x_.gather(1, index)), dim=1).contiguous()
523
+ if mask is not None:
524
+ mask_cls, mask_ = mask[..., :self.encoder_num_cls_tokens], mask[..., self.encoder_num_cls_tokens:]
525
+ index = idx.unsqueeze(1).unsqueeze(1) # [B, 1, 1, left_tokens]
526
+ mask = torch.cat((mask_cls, mask_.gather(-1, index)), dim=-1).contiguous()
527
+ return outputs, mask, idx
528
+
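To make the `cls_attn` branch of `sparsify` above concrete, here is a minimal sketch of top-k token selection by CLS attention, assuming a single CLS token and made-up tensor sizes:

import math
import torch

B, N, C = 2, 1 + 196, 768             # batch, CLS + patch tokens, width (hypothetical)
keep_rate = 0.5
x = torch.randn(B, N, C)              # fused visual sequence
attn = torch.rand(B, N, N)            # head-averaged cross-attention map

x_cls, x_patch = x[:, :1], x[:, 1:]
left = math.ceil(keep_rate * x_patch.size(1))
cls_attn = attn[:, 0, 1:]                      # attention paid by CLS to each patch token
_, idx = torch.topk(cls_attn, left, dim=1)     # the most-attended tokens survive
idx, _ = torch.sort(idx, dim=1)                # restore the original ordering
index = idx.unsqueeze(-1).expand(-1, -1, C)
x_sparse = torch.cat((x_cls, x_patch.gather(1, index)), dim=1)
assert x_sparse.shape == (B, 1 + left, C)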
529
+ def forward(
530
+ self,
531
+ hidden_states,
532
+ attention_mask=None,
533
+ head_mask=None,
534
+ encoder_hidden_states=None,
535
+ encoder_attention_mask=None,
536
+ past_key_value=None,
537
+ output_attentions=False,
538
+ ):
539
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
540
+ self_attn_past_key_value = past_key_value[:
541
+ 2] if past_key_value is not None else None
542
+ self_attention_outputs = self.attention(
543
+ hidden_states,
544
+ attention_mask,
545
+ head_mask,
546
+ output_attentions=output_attentions,
547
+ past_key_value=self_attn_past_key_value,
548
+ ) # (context_layer, attention_probs, attention_scores, past_key_value,)
549
+ attention_output = self_attention_outputs[0]
550
+
551
+ outputs = self_attention_outputs[1:-1]
552
+ present_key_value = self_attention_outputs[-1]
553
+
554
+ if self.has_cross_attention:
555
+ assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
556
+ output_attentions = (output_attentions or self.token_keep_rate < 1)
557
+
558
+ if type(encoder_hidden_states) == list:
559
+ cross_attention_outputs = self.crossattention(
560
+ attention_output,
561
+ attention_mask,
562
+ head_mask,
563
+ encoder_hidden_states[(
564
+ self.layer_num-self.config.fusion_layer) % len(encoder_hidden_states)],
565
+ encoder_attention_mask[(
566
+ self.layer_num-self.config.fusion_layer) % len(encoder_hidden_states)],
567
+ output_attentions=output_attentions,
568
+ )
569
+ attention_output = cross_attention_outputs[0]
570
+ outputs = outputs + cross_attention_outputs[1:-1]
571
+
572
+ else:
573
+ cross_attention_outputs = self.crossattention(
574
+ attention_output,
575
+ attention_mask,
576
+ head_mask,
577
+ encoder_hidden_states,
578
+ encoder_attention_mask,
579
+ output_attentions=output_attentions,
580
+ ) # (context_layer, attention_probs, attention_scores, past_key_value,)
581
+ attention_output = cross_attention_outputs[0]
582
+
583
+ # add cross attentions if we output attention weights
584
+ outputs = outputs + cross_attention_outputs[1:-1]
585
+
586
+ # node sparsification
587
+ if self.token_keep_rate < 1:
588
+ encoder_hidden_states, encoder_attention_mask, token_keep_idx = self.sparsify(
589
+ encoder_hidden_states, cross_attention_outputs[1], encoder_attention_mask)
590
+ outputs = outputs + (encoder_hidden_states, encoder_attention_mask, token_keep_idx)
591
+
592
+ layer_output = apply_chunking_to_forward(
593
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
594
+ )
595
+ outputs = (layer_output,) + outputs
596
+
597
+ outputs = outputs + (present_key_value,)
598
+
599
+ return outputs
600
+
601
+ def feed_forward_chunk(self, attention_output):
602
+ intermediate_output = self.intermediate(attention_output)
603
+ layer_output = self.output(intermediate_output, attention_output)
604
+ return layer_output
605
+
606
+
607
+ class BertEncoder(nn.Module):
608
+ def __init__(self, config):
609
+ super().__init__()
610
+ self.config = config
611
+
612
+ # node sparsification
613
+ token_keep_rate = [1] * config.num_hidden_layers
614
+ for loc in config.token_drop_loc:
615
+ token_keep_rate[loc] = config.token_keep_rate
616
+
617
+ self.layer = nn.ModuleList([BertLayer(config, i, token_keep_rate[i])
618
+ for i in range(config.num_hidden_layers)])
619
+
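As an illustration of the keep-rate schedule built above, with hypothetical config values (12 layers, tokens dropped at layers 10 and 11, keep rate 0.7) the per-layer list comes out as:

num_hidden_layers = 12
token_drop_loc = [10, 11]      # hypothetical values; the real ones come from the sparse BertConfig
cfg_keep_rate = 0.7

token_keep_rate = [1] * num_hidden_layers
for loc in token_drop_loc:
    token_keep_rate[loc] = cfg_keep_rate
# token_keep_rate == [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.7, 0.7]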
620
+ def forward(
621
+ self,
622
+ hidden_states,
623
+ attention_mask=None,
624
+ head_mask=None,
625
+ encoder_hidden_states=None,
626
+ encoder_attention_mask=None,
627
+ past_key_values=None,
628
+ use_cache=None,
629
+ output_attentions=False,
630
+ output_hidden_states=False,
631
+ output_token_idx=False,
632
+ return_dict=True,
633
+ mode='multi_modal',
634
+ normalize_attention=True
635
+ ):
636
+ all_hidden_states = () if output_hidden_states else None
637
+ all_self_attentions = () if output_attentions else None
638
+ all_cross_attentions = () if output_attentions else None
639
+ all_token_idx = () if output_token_idx else None
640
+
641
+ next_decoder_cache = () if use_cache else None
642
+
643
+ if mode == 'text':
644
+ start_layer = 0
645
+ output_layer = self.config.fusion_layer
646
+
647
+ elif mode == 'fusion':
648
+ start_layer = self.config.fusion_layer
649
+ output_layer = self.config.num_hidden_layers
650
+
651
+ elif mode == 'multi_modal':
652
+ start_layer = 0
653
+ output_layer = self.config.num_hidden_layers
654
+
655
+ for i in range(start_layer, output_layer):
656
+ layer_module = self.layer[i]
657
+ if output_hidden_states:
658
+ all_hidden_states = all_hidden_states + (hidden_states,)
659
+
660
+ layer_head_mask = head_mask[i] if head_mask is not None else None
661
+ past_key_value = past_key_values[i] if past_key_values is not None else None
662
+
663
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
664
+
665
+ if use_cache:
666
+ use_cache = False
667
+
668
+ def create_custom_forward(module):
669
+ def custom_forward(*inputs):
670
+ return module(*inputs, past_key_value, output_attentions)
671
+
672
+ return custom_forward
673
+
674
+ layer_outputs = torch.utils.checkpoint.checkpoint(
675
+ create_custom_forward(layer_module),
676
+ hidden_states,
677
+ attention_mask,
678
+ layer_head_mask,
679
+ encoder_hidden_states,
680
+ encoder_attention_mask,
681
+ )
682
+ else:
683
+ layer_outputs = layer_module(
684
+ hidden_states,
685
+ attention_mask,
686
+ layer_head_mask,
687
+ encoder_hidden_states,
688
+ encoder_attention_mask,
689
+ past_key_value,
690
+ output_attentions,
691
+ ) # (context_layer, attention_probs, attention_scores, past_key_value,)
692
+ hidden_states = layer_outputs[0]
693
+ # update visual sequence
694
+ if mode == 'fusion' and layer_module.token_keep_rate < 1:
695
+ encoder_hidden_states, encoder_attention_mask, token_idx = layer_outputs[-4:-1]
696
+
697
+ if output_token_idx:
698
+ all_token_idx = all_token_idx + (token_idx,)
699
+
700
+ if use_cache:
701
+ next_decoder_cache += (layer_outputs[-1],)
702
+ if output_attentions:
703
+ # whether to output normalized attention probabilities or raw scores;
704
+ # note that the raw (unnormalized) scores still include the additive attention mask
705
+ offset = int(normalize_attention)
706
+ all_self_attentions = all_self_attentions + (layer_outputs[2-offset], )
707
+ if hasattr(layer_module, "crossattention"):
708
+ all_cross_attentions = all_cross_attentions + (layer_outputs[4-offset], )
709
+
710
+ if output_hidden_states:
711
+ all_hidden_states = all_hidden_states + (hidden_states,)
712
+
713
+ if not return_dict:
714
+ return tuple(
715
+ v
716
+ for v in [
717
+ hidden_states,
718
+ next_decoder_cache,
719
+ all_hidden_states,
720
+ all_self_attentions,
721
+ all_cross_attentions,
722
+ ]
723
+ if v is not None
724
+ )
725
+ return BertModelOutputWithPastAndCrossAttentions(
726
+ last_hidden_state=hidden_states,
727
+ past_key_values=next_decoder_cache,
728
+ hidden_states=all_hidden_states,
729
+ attentions=all_self_attentions,
730
+ cross_attentions=all_cross_attentions,
731
+ token_idx=all_token_idx
732
+ )
733
+
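The `mode` argument above only controls which slice of the layer stack is run; with a hypothetical `fusion_layer` of 9 and 12 hidden layers, the three modes cover these ranges:

fusion_layer, num_hidden_layers = 9, 12    # hypothetical config values
layer_ranges = {
    "text":        range(0, fusion_layer),                  # text-only self-attention layers 0-8
    "fusion":      range(fusion_layer, num_hidden_layers),  # cross-attention layers 9-11
    "multi_modal": range(0, num_hidden_layers),             # full stack
}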
734
+
735
+ class BertPooler(nn.Module):
736
+ def __init__(self, config):
737
+ super().__init__()
738
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
739
+ self.activation = nn.Tanh()
740
+
741
+ def forward(self, hidden_states):
742
+ # We "pool" the model by simply taking the hidden state corresponding
743
+ # to the first token.
744
+ first_token_tensor = hidden_states[:, 0]
745
+ pooled_output = self.dense(first_token_tensor)
746
+ pooled_output = self.activation(pooled_output)
747
+ return pooled_output
748
+
749
+
750
+ class BertPredictionHeadTransform(nn.Module):
751
+ def __init__(self, config):
752
+ super().__init__()
753
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
754
+ if isinstance(config.hidden_act, str):
755
+ self.transform_act_fn = ACT2FN[config.hidden_act]
756
+ else:
757
+ self.transform_act_fn = config.hidden_act
758
+ self.LayerNorm = nn.LayerNorm(
759
+ config.hidden_size, eps=config.layer_norm_eps)
760
+
761
+ def forward(self, hidden_states):
762
+ hidden_states = self.dense(hidden_states)
763
+ hidden_states = self.transform_act_fn(hidden_states)
764
+ hidden_states = self.LayerNorm(hidden_states)
765
+ return hidden_states
766
+
767
+
768
+ class BertLMPredictionHead(nn.Module):
769
+ def __init__(self, config):
770
+ super().__init__()
771
+ self.transform = BertPredictionHeadTransform(config)
772
+
773
+ # The output weights are the same as the input embeddings, but there is
774
+ # an output-only bias for each token.
775
+ self.decoder = nn.Linear(
776
+ config.hidden_size, config.vocab_size, bias=False)
777
+
778
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
779
+
780
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
781
+ self.decoder.bias = self.bias
782
+
783
+ def forward(self, hidden_states):
784
+ hidden_states = self.transform(hidden_states)
785
+ hidden_states = self.decoder(hidden_states)
786
+ return hidden_states
787
+
788
+
789
+ class BertOnlyMLMHead(nn.Module):
790
+ def __init__(self, config):
791
+ super().__init__()
792
+ self.predictions = BertLMPredictionHead(config)
793
+
794
+ def forward(self, sequence_output):
795
+ prediction_scores = self.predictions(sequence_output)
796
+ return prediction_scores
797
+
798
+
799
+ class BertOnlyNSPHead(nn.Module):
800
+ def __init__(self, config):
801
+ super().__init__()
802
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
803
+
804
+ def forward(self, pooled_output):
805
+ seq_relationship_score = self.seq_relationship(pooled_output)
806
+ return seq_relationship_score
807
+
808
+
809
+ class BertPreTrainingHeads(nn.Module):
810
+ def __init__(self, config):
811
+ super().__init__()
812
+ self.predictions = BertLMPredictionHead(config)
813
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
814
+
815
+ def forward(self, sequence_output, pooled_output):
816
+ prediction_scores = self.predictions(sequence_output)
817
+ seq_relationship_score = self.seq_relationship(pooled_output)
818
+ return prediction_scores, seq_relationship_score
819
+
820
+
821
+ class BertPreTrainedModel(PreTrainedModel):
822
+ """
823
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
824
+ models.
825
+ """
826
+
827
+ config_class = BertConfig
828
+ load_tf_weights = load_tf_weights_in_bert
829
+ base_model_prefix = "bert"
830
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
831
+
832
+ def _init_weights(self, module):
833
+ """ Initialize the weights """
834
+ if isinstance(module, (nn.Linear, nn.Embedding)):
835
+ # Slightly different from the TF version which uses truncated_normal for initialization
836
+ # cf https://github.com/pytorch/pytorch/pull/5617
837
+ module.weight.data.normal_(
838
+ mean=0.0, std=self.config.initializer_range)
839
+ elif isinstance(module, nn.LayerNorm):
840
+ module.bias.data.zero_()
841
+ module.weight.data.fill_(1.0)
842
+ if isinstance(module, nn.Linear) and module.bias is not None:
843
+ module.bias.data.zero_()
844
+
845
+
846
+ @dataclass
847
+ class BertForPreTrainingOutput(ModelOutput):
848
+ """
849
+ Output type of :class:`~transformers.BertForPreTraining`.
850
+ Args:
851
+ loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
852
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction
853
+ (classification) loss.
854
+ prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
855
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
856
+ seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
857
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
858
+ before SoftMax).
859
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
860
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
861
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
862
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
863
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
864
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
865
+ sequence_length, sequence_length)`.
866
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
867
+ heads.
868
+ """
869
+
870
+ loss: Optional[torch.FloatTensor] = None
871
+ prediction_logits: torch.FloatTensor = None
872
+ seq_relationship_logits: torch.FloatTensor = None
873
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
874
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
875
+
876
+
877
+ BERT_START_DOCSTRING = r"""
878
+ This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
879
+ methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
880
+ pruning heads etc.)
881
+ This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
882
+ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
883
+ general usage and behavior.
884
+ Parameters:
885
+ config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
886
+ Initializing with a config file does not load the weights associated with the model, only the
887
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
888
+ weights.
889
+ """
890
+
891
+ BERT_INPUTS_DOCSTRING = r"""
892
+ Args:
893
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
894
+ Indices of input sequence tokens in the vocabulary.
895
+ Indices can be obtained using :class:`~transformers.BertTokenizer`. See
896
+ :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
897
+ details.
898
+ `What are input IDs? <../glossary.html#input-ids>`__
899
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
900
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
901
+ - 1 for tokens that are **not masked**,
902
+ - 0 for tokens that are **masked**.
903
+ `What are attention masks? <../glossary.html#attention-mask>`__
904
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
905
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
906
+ 1]``:
907
+ - 0 corresponds to a `sentence A` token,
908
+ - 1 corresponds to a `sentence B` token.
909
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
910
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
911
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
912
+ config.max_position_embeddings - 1]``.
913
+ `What are position IDs? <../glossary.html#position-ids>`_
914
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
915
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
916
+ - 1 indicates the head is **not masked**,
917
+ - 0 indicates the head is **masked**.
918
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
919
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
920
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
921
+ vectors than the model's internal embedding lookup matrix.
922
+ output_attentions (:obj:`bool`, `optional`):
923
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
924
+ tensors for more detail.
925
+ output_hidden_states (:obj:`bool`, `optional`):
926
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
927
+ more detail.
928
+ return_dict (:obj:`bool`, `optional`):
929
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
930
+ """
931
+
932
+
933
+ @add_start_docstrings(
934
+ "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
935
+ BERT_START_DOCSTRING,
936
+ )
937
+ class BertModel(BertPreTrainedModel):
938
+ """
939
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
940
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
941
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
942
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
943
+ argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
944
+ input to the forward pass.
945
+ """
946
+
947
+ def __init__(self, config, add_pooling_layer=True):
948
+ super().__init__(config)
949
+ self.config = config
950
+
951
+ self.embeddings = BertEmbeddings(config)
952
+
953
+ self.encoder = BertEncoder(config)
954
+
955
+ self.pooler = BertPooler(config) if add_pooling_layer else None
956
+
957
+ self.init_weights()
958
+
959
+ def get_input_embeddings(self):
960
+ return self.embeddings.word_embeddings
961
+
962
+ def set_input_embeddings(self, value):
963
+ self.embeddings.word_embeddings = value
964
+
965
+ def _prune_heads(self, heads_to_prune):
966
+ """
967
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
968
+ class PreTrainedModel
969
+ """
970
+ for layer, heads in heads_to_prune.items():
971
+ self.encoder.layer[layer].attention.prune_heads(heads)
972
+
973
+ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
974
+ """
975
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
976
+
977
+ Arguments:
978
+ attention_mask (:obj:`torch.Tensor`):
979
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
980
+ input_shape (:obj:`Tuple[int]`):
981
+ The shape of the input to the model.
982
+ device: (:obj:`torch.device`):
983
+ The device of the input to the model.
984
+
985
+ Returns:
986
+ :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
987
+ """
988
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
989
+ # ourselves in which case we just need to make it broadcastable to all heads.
990
+ if attention_mask.dim() == 3:
991
+ extended_attention_mask = attention_mask[:, None, :, :]
992
+ elif attention_mask.dim() == 2:
993
+ # Provided a padding mask of dimensions [batch_size, seq_length]
994
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
995
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
996
+ if is_decoder:
997
+ batch_size, seq_length = input_shape
998
+ seq_ids = torch.arange(seq_length, device=device)
999
+ causal_mask = seq_ids[None, None, :].repeat(
1000
+ batch_size, seq_length, 1) <= seq_ids[None, :, None]
1001
+ # in case past_key_values are used we need to add a prefix ones mask to the causal mask
1002
+ # causal and attention masks must have same type with pytorch version < 1.3
1003
+ causal_mask = causal_mask.to(attention_mask.dtype)
1004
+
1005
+ if causal_mask.shape[1] < attention_mask.shape[1]:
1006
+ prefix_seq_len = attention_mask.shape[1] - \
1007
+ causal_mask.shape[1]
1008
+ causal_mask = torch.cat(
1009
+ [
1010
+ torch.ones(
1011
+ (batch_size, seq_length,
1012
+ prefix_seq_len), device=device, dtype=causal_mask.dtype
1013
+ ),
1014
+ causal_mask,
1015
+ ],
1016
+ axis=-1,
1017
+ )
1018
+
1019
+ extended_attention_mask = causal_mask[:, None,
1020
+ :, :] * attention_mask[:, None, None, :]
1021
+ else:
1022
+ extended_attention_mask = attention_mask[:, None, None, :]
1023
+ else:
1024
+ raise ValueError(
1025
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
1026
+ input_shape, attention_mask.shape
1027
+ )
1028
+ )
1029
+
1030
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
1031
+ # masked positions, this operation will create a tensor which is 0.0 for
1032
+ # positions we want to attend and -10000.0 for masked positions.
1033
+ # Since we are adding it to the raw scores before the softmax, this is
1034
+ # effectively the same as removing these entirely.
1035
+ extended_attention_mask = extended_attention_mask.to(
1036
+ dtype=self.dtype) # fp16 compatibility
1037
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
1038
+ return extended_attention_mask
1039
+
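To make the additive masking above concrete, a minimal sketch of how a 2-D padding mask turns into the bias added to the attention scores (encoder case, no causal mask):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])            # last position is padding
extended = attention_mask[:, None, None, :].float()      # broadcastable to [batch, heads, q_len, k_len]
extended = (1.0 - extended) * -10000.0
# attended positions contribute 0.0, padded positions -10000.0 to the pre-softmax scores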
1040
+ def forward(
1041
+ self,
1042
+ input_ids=None,
1043
+ attention_mask=None,
1044
+ token_type_ids=None,
1045
+ position_ids=None,
1046
+ head_mask=None,
1047
+ inputs_embeds=None,
1048
+ encoder_embeds=None,
1049
+ encoder_hidden_states=None,
1050
+ encoder_attention_mask=None,
1051
+ past_key_values=None,
1052
+ use_cache=None,
1053
+ output_attentions=None,
1054
+ output_hidden_states=None,
1055
+ output_token_idx=None,
1056
+ return_dict=None,
1057
+ is_decoder=False,
1058
+ mode='multi_modal',
1059
+ normalize_attention=True,
1060
+ ):
1061
+ r"""
1062
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
1063
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1064
+ the model is configured as a decoder.
1065
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1066
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1067
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
1068
+ - 1 for tokens that are **not masked**,
1069
+ - 0 for tokens that are **masked**.
1070
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1071
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1072
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
1073
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
1074
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
1075
+ use_cache (:obj:`bool`, `optional`):
1076
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
1077
+ decoding (see :obj:`past_key_values`).
1078
+ """
1079
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1080
+ output_hidden_states = (
1081
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1082
+ )
1083
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1084
+
1085
+ if is_decoder:
1086
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1087
+ else:
1088
+ use_cache = False
1089
+
1090
+ if input_ids is not None and inputs_embeds is not None:
1091
+ raise ValueError(
1092
+ "You cannot specify both input_ids and inputs_embeds at the same time")
1093
+ elif input_ids is not None:
1094
+ input_shape = input_ids.size()
1095
+ batch_size, seq_length = input_shape
1096
+ device = input_ids.device
1097
+ elif inputs_embeds is not None:
1098
+ input_shape = inputs_embeds.size()[:-1]
1099
+ batch_size, seq_length = input_shape
1100
+ device = inputs_embeds.device
1101
+ elif encoder_embeds is not None:
1102
+ input_shape = encoder_embeds.size()[:-1]
1103
+ batch_size, seq_length = input_shape
1104
+ device = encoder_embeds.device
1105
+ else:
1106
+ raise ValueError(
1107
+ "You have to specify either input_ids or inputs_embeds or encoder_embeds")
1108
+
1109
+ # past_key_values_length
1110
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
1111
+
1112
+ if attention_mask is None:
1113
+ attention_mask = torch.ones(
1114
+ ((batch_size, seq_length + past_key_values_length)), device=device)
1115
+ if token_type_ids is None:
1116
+ token_type_ids = torch.zeros(
1117
+ input_shape, dtype=torch.long, device=device)
1118
+
1119
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
1120
+ # ourselves in which case we just need to make it broadcastable to all heads.
1121
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
1122
+ device, is_decoder)
1123
+
1124
+ # If a 2D or 3D attention mask is provided for the cross-attention
1125
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
1126
+ if encoder_hidden_states is not None:
1127
+ if type(encoder_hidden_states) == list:
1128
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size(
1129
+ )
1130
+ else:
1131
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
1132
+ encoder_hidden_shape = (
1133
+ encoder_batch_size, encoder_sequence_length)
1134
+
1135
+ if type(encoder_attention_mask) == list:
1136
+ encoder_extended_attention_mask = [
1137
+ self.invert_attention_mask(mask) for mask in encoder_attention_mask]
1138
+ elif encoder_attention_mask is None:
1139
+ encoder_attention_mask = torch.ones(
1140
+ encoder_hidden_shape, device=device)
1141
+ encoder_extended_attention_mask = self.invert_attention_mask(
1142
+ encoder_attention_mask)
1143
+ else:
1144
+ encoder_extended_attention_mask = self.invert_attention_mask(
1145
+ encoder_attention_mask)
1146
+ else:
1147
+ encoder_extended_attention_mask = None
1148
+
1149
+ # Prepare head mask if needed
1150
+ # 1.0 in head_mask indicate we keep the head
1151
+ # attention_probs has shape bsz x n_heads x N x N
1152
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1153
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1154
+ head_mask = self.get_head_mask(
1155
+ head_mask, self.config.num_hidden_layers)
1156
+
1157
+ if encoder_embeds is None:
1158
+ embedding_output = self.embeddings(
1159
+ input_ids=input_ids,
1160
+ position_ids=position_ids,
1161
+ token_type_ids=token_type_ids,
1162
+ inputs_embeds=inputs_embeds,
1163
+ past_key_values_length=past_key_values_length,
1164
+ )
1165
+ else:
1166
+ embedding_output = encoder_embeds
1167
+
1168
+ encoder_outputs = self.encoder(
1169
+ embedding_output,
1170
+ attention_mask=extended_attention_mask,
1171
+ head_mask=head_mask,
1172
+ encoder_hidden_states=encoder_hidden_states,
1173
+ encoder_attention_mask=encoder_extended_attention_mask,
1174
+ past_key_values=past_key_values,
1175
+ use_cache=use_cache,
1176
+ output_attentions=output_attentions,
1177
+ output_hidden_states=output_hidden_states,
1178
+ output_token_idx=output_token_idx,
1179
+ return_dict=return_dict,
1180
+ mode=mode,
1181
+ normalize_attention=normalize_attention,
1182
+
1183
+ )
1184
+ sequence_output = encoder_outputs[0]
1185
+ pooled_output = self.pooler(
1186
+ sequence_output) if self.pooler is not None else None
1187
+
1188
+ if not return_dict:
1189
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1190
+
1191
+ return BertModelOutputWithPoolingAndCrossAttentions(
1192
+ last_hidden_state=sequence_output,
1193
+ pooler_output=pooled_output,
1194
+ past_key_values=encoder_outputs.past_key_values,
1195
+ hidden_states=encoder_outputs.hidden_states,
1196
+ attentions=encoder_outputs.attentions,
1197
+ cross_attentions=encoder_outputs.cross_attentions,
1198
+ token_idx=encoder_outputs.token_idx,
1199
+ )
1200
+
1201
+
1202
+ @add_start_docstrings(
1203
+ """
1204
+ Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
1205
+ sentence prediction (classification)` head.
1206
+ """,
1207
+ BERT_START_DOCSTRING,
1208
+ )
1209
+ class BertForPreTraining(BertPreTrainedModel):
1210
+ def __init__(self, config):
1211
+ super().__init__(config)
1212
+
1213
+ self.bert = BertModel(config)
1214
+ self.cls = BertPreTrainingHeads(config)
1215
+
1216
+ self.init_weights()
1217
+
1218
+ def get_output_embeddings(self):
1219
+ return self.cls.predictions.decoder
1220
+
1221
+ def set_output_embeddings(self, new_embeddings):
1222
+ self.cls.predictions.decoder = new_embeddings
1223
+
1224
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1225
+ @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
1226
+ def forward(
1227
+ self,
1228
+ input_ids=None,
1229
+ attention_mask=None,
1230
+ token_type_ids=None,
1231
+ position_ids=None,
1232
+ head_mask=None,
1233
+ inputs_embeds=None,
1234
+ labels=None,
1235
+ next_sentence_label=None,
1236
+ output_attentions=None,
1237
+ output_hidden_states=None,
1238
+ return_dict=None,
1239
+ ):
1240
+ r"""
1241
+ labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
1242
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
1243
+ config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
1244
+ (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
1245
+ next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
1246
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
1247
+ (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
1248
+ - 0 indicates sequence B is a continuation of sequence A,
1249
+ - 1 indicates sequence B is a random sequence.
1250
+ kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
1251
+ Used to hide legacy arguments that have been deprecated.
1252
+ Returns:
1253
+ Example::
1254
+ >>> from transformers import BertTokenizer, BertForPreTraining
1255
+ >>> import torch
1256
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1257
+ >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
1258
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1259
+ >>> outputs = model(**inputs)
1260
+ >>> prediction_logits = outputs.prediction_logits
1261
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
1262
+ """
1263
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1264
+
1265
+ outputs = self.bert(
1266
+ input_ids,
1267
+ attention_mask=attention_mask,
1268
+ token_type_ids=token_type_ids,
1269
+ position_ids=position_ids,
1270
+ head_mask=head_mask,
1271
+ inputs_embeds=inputs_embeds,
1272
+ output_attentions=output_attentions,
1273
+ output_hidden_states=output_hidden_states,
1274
+ return_dict=return_dict,
1275
+ )
1276
+
1277
+ sequence_output, pooled_output = outputs[:2]
1278
+ prediction_scores, seq_relationship_score = self.cls(
1279
+ sequence_output, pooled_output)
1280
+
1281
+ total_loss = None
1282
+ if labels is not None and next_sentence_label is not None:
1283
+ loss_fct = CrossEntropyLoss()
1284
+ masked_lm_loss = loss_fct(
1285
+ prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1286
+ next_sentence_loss = loss_fct(
1287
+ seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
1288
+ total_loss = masked_lm_loss + next_sentence_loss
1289
+
1290
+ if not return_dict:
1291
+ output = (prediction_scores, seq_relationship_score) + outputs[2:]
1292
+ return ((total_loss,) + output) if total_loss is not None else output
1293
+
1294
+ return BertForPreTrainingOutput(
1295
+ loss=total_loss,
1296
+ prediction_logits=prediction_scores,
1297
+ seq_relationship_logits=seq_relationship_score,
1298
+ hidden_states=outputs.hidden_states,
1299
+ attentions=outputs.attentions,
1300
+ )
1301
+
1302
+
1303
+ @add_start_docstrings(
1304
+ """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, BERT_START_DOCSTRING
1305
+ )
1306
+ class BertLMHeadModel(BertPreTrainedModel):
1307
+
1308
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1309
+ _keys_to_ignore_on_load_missing = [
1310
+ r"position_ids", r"predictions.decoder.bias"]
1311
+
1312
+ def __init__(self, config):
1313
+ super().__init__(config)
1314
+
1315
+ self.bert = BertModel(config, add_pooling_layer=False)
1316
+ self.cls = BertOnlyMLMHead(config)
1317
+
1318
+ self.init_weights()
1319
+
1320
+ def get_output_embeddings(self):
1321
+ return self.cls.predictions.decoder
1322
+
1323
+ def set_output_embeddings(self, new_embeddings):
1324
+ self.cls.predictions.decoder = new_embeddings
1325
+
1326
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1327
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
1328
+ def forward(
1329
+ self,
1330
+ input_ids=None,
1331
+ attention_mask=None,
1332
+ token_type_ids=None,
1333
+ position_ids=None,
1334
+ head_mask=None,
1335
+ inputs_embeds=None,
1336
+ encoder_hidden_states=None,
1337
+ encoder_attention_mask=None,
1338
+ labels=None,
1339
+ past_key_values=None,
1340
+ use_cache=None,
1341
+ output_attentions=None,
1342
+ output_hidden_states=None,
1343
+ return_dict=None,
1344
+ is_decoder=True,
1345
+ reduction='mean',
1346
+ mode='multi_modal',
1347
+ normalize_attention=True,
1348
+ soft_labels=None,
1349
+ alpha=0,
1350
+ return_logits=False,
1351
+ ):
1352
+ r"""
1353
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
1354
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1355
+ the model is configured as a decoder.
1356
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1357
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1358
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
1359
+ - 1 for tokens that are **not masked**,
1360
+ - 0 for tokens that are **masked**.
1361
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1362
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
1363
+ ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
1364
+ ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
1365
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1366
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1367
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
1368
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
1369
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
1370
+ use_cache (:obj:`bool`, `optional`):
1371
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
1372
+ decoding (see :obj:`past_key_values`).
1373
+ Returns:
1374
+ Example::
1375
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
1376
+ >>> import torch
1377
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
1378
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
1379
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
1380
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1381
+ >>> outputs = model(**inputs)
1382
+ >>> prediction_logits = outputs.logits
1383
+ """
1384
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1385
+ if labels is not None:
1386
+ use_cache = False
1387
+
1388
+ outputs = self.bert(
1389
+ input_ids,
1390
+ attention_mask=attention_mask,
1391
+ token_type_ids=token_type_ids,
1392
+ position_ids=position_ids,
1393
+ head_mask=head_mask,
1394
+ inputs_embeds=inputs_embeds,
1395
+ encoder_hidden_states=encoder_hidden_states,
1396
+ encoder_attention_mask=encoder_attention_mask,
1397
+ past_key_values=past_key_values,
1398
+ use_cache=use_cache,
1399
+ output_attentions=output_attentions,
1400
+ output_hidden_states=output_hidden_states,
1401
+ return_dict=return_dict,
1402
+ is_decoder=is_decoder,
1403
+ mode=mode,
1404
+ normalize_attention=normalize_attention,
1405
+ )
1406
+
1407
+ sequence_output = outputs[0]
1408
+ prediction_scores = self.cls(sequence_output)
1409
+
1410
+ if return_logits:
1411
+ return prediction_scores[:, :-1, :].contiguous()
1412
+
1413
+ lm_loss = None
1414
+ if labels is not None:
1415
+ # we are doing next-token prediction; shift prediction scores and input ids by one
1416
+ shifted_prediction_scores = prediction_scores[:,
1417
+ :-1, :].contiguous()
1418
+ labels = labels[:, 1:].contiguous()
1419
+ loss_fct = CrossEntropyLoss(reduction=reduction)
1420
+ lm_loss = loss_fct(
1421
+ shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1422
+ lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
1423
+
1424
+ if soft_labels is not None:
1425
+ loss_distill = - \
1426
+ torch.sum(F.log_softmax(shifted_prediction_scores,
1427
+ dim=1)*soft_labels, dim=-1)
1428
+ loss_distill = (loss_distill * (labels != -100)).sum(1)
1429
+ lm_loss = (1-alpha)*lm_loss + alpha*loss_distill
1430
+
1431
+ if not return_dict:
1432
+ output = (prediction_scores,) + outputs[2:]
1433
+ return ((lm_loss,) + output) if lm_loss is not None else output
1434
+
1435
+ return CausalLMOutputWithCrossAttentions(
1436
+ loss=lm_loss,
1437
+ logits=prediction_scores,
1438
+ past_key_values=outputs.past_key_values,
1439
+ hidden_states=outputs.hidden_states,
1440
+ attentions=outputs.attentions,
1441
+ cross_attentions=outputs.cross_attentions,
1442
+ )
1443
+
1444
+ def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
1445
+ input_shape = input_ids.shape
1446
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
1447
+ if attention_mask is None:
1448
+ attention_mask = input_ids.new_ones(input_shape)
1449
+
1450
+ # cut decoder_input_ids if past is used
1451
+ if past is not None:
1452
+ input_ids = input_ids[:, -1:]
1453
+
1454
+ return {
1455
+ "input_ids": input_ids,
1456
+ "attention_mask": attention_mask,
1457
+ "past_key_values": past,
1458
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
1459
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
1460
+ "is_decoder": True,
1461
+ }
1462
+
1463
+ def _reorder_cache(self, past, beam_idx):
1464
+ reordered_past = ()
1465
+ for layer_past in past:
1466
+ reordered_past += (tuple(past_state.index_select(0, beam_idx)
1467
+ for past_state in layer_past),)
1468
+ return reordered_past
1469
+
1470
+
1471
+ @dataclass
1472
+ class MaskedLMOutputWithDistill(MaskedLMOutput):
1473
+ loss_aux: Optional[torch.FloatTensor] = None
1474
+ loss_distill: Optional[torch.FloatTensor] = None
1475
+
1476
+
1477
+ @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
1478
+ class BertForMaskedLM(BertPreTrainedModel):
1479
+
1480
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1481
+ _keys_to_ignore_on_load_missing = [
1482
+ r"position_ids", r"predictions.decoder.bias"]
1483
+
1484
+ def __init__(self, config):
1485
+ super().__init__(config)
1486
+
1487
+ self.bert = BertModel(config, add_pooling_layer=False)
1488
+ self.cls = BertOnlyMLMHead(config)
1489
+
1490
+ self.init_weights()
1491
+
1492
+ def tie_aux_decoder_weights(self, module, aux_modules):
1493
+ """Tie decoder weights of all `aux_modules` to `module`, (not bias)"""
1494
+ for m in aux_modules:
1495
+ m.predictions.decoder.weight = module.predictions.decoder.weight
1496
+
1497
+ def get_output_embeddings(self):
1498
+ return self.cls.predictions.decoder
1499
+
1500
+ def set_output_embeddings(self, new_embeddings):
1501
+ self.cls.predictions.decoder = new_embeddings
1502
+
1503
+ def forward(
1504
+ self,
1505
+ input_ids=None,
1506
+ attention_mask=None,
1507
+ token_type_ids=None,
1508
+ position_ids=None,
1509
+ head_mask=None,
1510
+ inputs_embeds=None,
1511
+ encoder_embeds=None,
1512
+ encoder_hidden_states=None,
1513
+ encoder_attention_mask=None,
1514
+ labels=None,
1515
+ output_attentions=None,
1516
+ output_hidden_states=None,
1517
+ return_dict=None,
1518
+ is_decoder=False,
1519
+ mode='multi_modal',
1520
+ normalize_attention=True,
1521
+ soft_labels=None,
1522
+ alpha=0,
1523
+ return_logits=False,
1524
+ ):
1525
+ r"""
1526
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1527
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
1528
+ config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
1529
+ (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
1530
+ """
1531
+
1532
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1533
+
1534
+ outputs = self.bert(
1535
+ input_ids,
1536
+ attention_mask=attention_mask,
1537
+ token_type_ids=token_type_ids,
1538
+ position_ids=position_ids,
1539
+ head_mask=head_mask,
1540
+ inputs_embeds=inputs_embeds,
1541
+ encoder_embeds=encoder_embeds,
1542
+ encoder_hidden_states=encoder_hidden_states,
1543
+ encoder_attention_mask=encoder_attention_mask,
1544
+ output_attentions=output_attentions,
1545
+ output_hidden_states=output_hidden_states,
1546
+ return_dict=return_dict,
1547
+ is_decoder=is_decoder,
1548
+ mode=mode,
1549
+ normalize_attention=normalize_attention
1550
+ )
1551
+
1552
+ sequence_output = outputs[0]
1553
+ prediction_scores = self.cls(sequence_output)
1554
+
1555
+ if return_logits:
1556
+ return prediction_scores
1557
+
1558
+ masked_lm_loss = None
1559
+ masked_lm_loss_aux = 0.
1560
+ if labels is not None:
1561
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
1562
+ masked_lm_loss = loss_fct(
1563
+ prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1564
+
1565
+ if soft_labels is not None:
1566
+ loss_distill = - \
1567
+ torch.sum(F.log_softmax(prediction_scores, dim=1)
1568
+ * soft_labels, dim=-1)
1569
+ loss_distill = loss_distill[labels != -100].mean()
1570
+ masked_lm_loss = (1-alpha)*masked_lm_loss + alpha*loss_distill
1571
+
1572
+ if not return_dict:
1573
+ output = (prediction_scores,) + outputs[2:]
1574
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1575
+
1576
+ # changed from MaskedLMOutput to MaskedLMOutputWithDistill
1577
+ return MaskedLMOutputWithDistill(
1578
+ loss=masked_lm_loss,
1579
+ loss_aux=masked_lm_loss_aux,
1580
+ logits=prediction_scores,
1581
+ hidden_states=outputs.hidden_states,
1582
+ attentions=outputs.attentions,
1583
+ )
1584
+
1585
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
1586
+ input_shape = input_ids.shape
1587
+ effective_batch_size = input_shape[0]
1588
+
1589
+ # add a dummy token
1590
+ assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
1591
+ attention_mask = torch.cat(
1592
+ [attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
1593
+ dummy_token = torch.full(
1594
+ (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
1595
+ )
1596
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
1597
+
1598
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
1599
+
1600
+
1601
+ @add_start_docstrings(
1602
+ """Bert Model with a `next sentence prediction (classification)` head on top. """,
1603
+ BERT_START_DOCSTRING,
1604
+ )
1605
+ class BertForNextSentencePrediction(BertPreTrainedModel):
1606
+ def __init__(self, config):
1607
+ super().__init__(config)
1608
+
1609
+ self.bert = BertModel(config)
1610
+ self.cls = BertOnlyNSPHead(config)
1611
+
1612
+ self.init_weights()
1613
+
1614
+ @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1615
+ @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
1616
+ def forward(
1617
+ self,
1618
+ input_ids=None,
1619
+ attention_mask=None,
1620
+ token_type_ids=None,
1621
+ position_ids=None,
1622
+ head_mask=None,
1623
+ inputs_embeds=None,
1624
+ labels=None,
1625
+ output_attentions=None,
1626
+ output_hidden_states=None,
1627
+ return_dict=None,
1628
+ **kwargs
1629
+ ):
1630
+ r"""
1631
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1632
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
1633
+ (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
1634
+ - 0 indicates sequence B is a continuation of sequence A,
1635
+ - 1 indicates sequence B is a random sequence.
1636
+ Returns:
1637
+ Example::
1638
+ >>> from transformers import BertTokenizer, BertForNextSentencePrediction
1639
+ >>> import torch
1640
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1641
+ >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
1642
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
1643
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
1644
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
1645
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
1646
+ >>> logits = outputs.logits
1647
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
1648
+ """
1649
+
1650
+ if "next_sentence_label" in kwargs:
1651
+ warnings.warn(
1652
+ "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.",
1653
+ FutureWarning,
1654
+ )
1655
+ labels = kwargs.pop("next_sentence_label")
1656
+
1657
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1658
+
1659
+ outputs = self.bert(
1660
+ input_ids,
1661
+ attention_mask=attention_mask,
1662
+ token_type_ids=token_type_ids,
1663
+ position_ids=position_ids,
1664
+ head_mask=head_mask,
1665
+ inputs_embeds=inputs_embeds,
1666
+ output_attentions=output_attentions,
1667
+ output_hidden_states=output_hidden_states,
1668
+ return_dict=return_dict,
1669
+ )
1670
+
1671
+ pooled_output = outputs[1]
1672
+
1673
+ seq_relationship_scores = self.cls(pooled_output)
1674
+
1675
+ next_sentence_loss = None
1676
+ if labels is not None:
1677
+ loss_fct = CrossEntropyLoss()
1678
+ next_sentence_loss = loss_fct(
1679
+ seq_relationship_scores.view(-1, 2), labels.view(-1))
1680
+
1681
+ if not return_dict:
1682
+ output = (seq_relationship_scores,) + outputs[2:]
1683
+ return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
1684
+
1685
+ return NextSentencePredictorOutput(
1686
+ loss=next_sentence_loss,
1687
+ logits=seq_relationship_scores,
1688
+ hidden_states=outputs.hidden_states,
1689
+ attentions=outputs.attentions,
1690
+ )
1691
+
1692
+
1693
+ @add_start_docstrings(
1694
+ """
1695
+ Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
1696
+ output) e.g. for GLUE tasks.
1697
+ """,
1698
+ BERT_START_DOCSTRING,
1699
+ )
1700
+ class BertForSequenceClassification(BertPreTrainedModel):
1701
+ def __init__(self, config):
1702
+ super().__init__(config)
1703
+ self.num_labels = config.num_labels
1704
+
1705
+ self.bert = BertModel(config)
1706
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1707
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1708
+
1709
+ self.init_weights()
1710
+
1711
+ def forward(
1712
+ self,
1713
+ input_ids=None,
1714
+ attention_mask=None,
1715
+ token_type_ids=None,
1716
+ position_ids=None,
1717
+ head_mask=None,
1718
+ inputs_embeds=None,
1719
+ labels=None,
1720
+ output_attentions=None,
1721
+ output_hidden_states=None,
1722
+ return_dict=None,
1723
+ ):
1724
+ r"""
1725
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1726
+ Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
1727
+ config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1728
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1729
+ """
1730
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1731
+
1732
+ outputs = self.bert(
1733
+ input_ids,
1734
+ attention_mask=attention_mask,
1735
+ token_type_ids=token_type_ids,
1736
+ position_ids=position_ids,
1737
+ head_mask=head_mask,
1738
+ inputs_embeds=inputs_embeds,
1739
+ output_attentions=output_attentions,
1740
+ output_hidden_states=output_hidden_states,
1741
+ return_dict=return_dict,
1742
+ )
1743
+
1744
+ pooled_output = outputs[1]
1745
+
1746
+ pooled_output = self.dropout(pooled_output)
1747
+ logits = self.classifier(pooled_output)
1748
+
1749
+ loss = None
1750
+ if labels is not None:
1751
+ if self.num_labels == 1:
1752
+ # We are doing regression
1753
+ loss_fct = MSELoss()
1754
+ loss = loss_fct(logits.view(-1), labels.view(-1))
1755
+ else:
1756
+ loss_fct = CrossEntropyLoss()
1757
+ loss = loss_fct(
1758
+ logits.view(-1, self.num_labels), labels.view(-1))
1759
+
1760
+ if not return_dict:
1761
+ output = (logits,) + outputs[2:]
1762
+ return ((loss,) + output) if loss is not None else output
1763
+
1764
+ return SequenceClassifierOutput(
1765
+ loss=loss,
1766
+ logits=logits,
1767
+ hidden_states=outputs.hidden_states,
1768
+ attentions=outputs.attentions,
1769
+ )
1770
+
1771
+
1772
+ @add_start_docstrings(
1773
+ """
1774
+ Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
1775
+ softmax) e.g. for RocStories/SWAG tasks.
1776
+ """,
1777
+ BERT_START_DOCSTRING,
1778
+ )
1779
+ class BertForMultipleChoice(BertPreTrainedModel):
1780
+ def __init__(self, config):
1781
+ super().__init__(config)
1782
+
1783
+ self.bert = BertModel(config)
1784
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1785
+ self.classifier = nn.Linear(config.hidden_size, 1)
1786
+
1787
+ self.init_weights()
1788
+
1789
+ def forward(
1790
+ self,
1791
+ input_ids=None,
1792
+ attention_mask=None,
1793
+ token_type_ids=None,
1794
+ position_ids=None,
1795
+ head_mask=None,
1796
+ inputs_embeds=None,
1797
+ labels=None,
1798
+ output_attentions=None,
1799
+ output_hidden_states=None,
1800
+ return_dict=None,
1801
+ ):
1802
+ r"""
1803
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1804
+ Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
1805
+ num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
1806
+ :obj:`input_ids` above)
1807
+ """
1808
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1809
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1810
+
1811
+ input_ids = input_ids.view(-1, input_ids.size(-1)
1812
+ ) if input_ids is not None else None
1813
+ attention_mask = attention_mask.view(
1814
+ -1, attention_mask.size(-1)) if attention_mask is not None else None
1815
+ token_type_ids = token_type_ids.view(
1816
+ -1, token_type_ids.size(-1)) if token_type_ids is not None else None
1817
+ position_ids = position_ids.view(-1, position_ids.size(-1)
1818
+ ) if position_ids is not None else None
1819
+ inputs_embeds = (
1820
+ inputs_embeds.view(-1, inputs_embeds.size(-2),
1821
+ inputs_embeds.size(-1))
1822
+ if inputs_embeds is not None
1823
+ else None
1824
+ )
1825
+
1826
+ outputs = self.bert(
1827
+ input_ids,
1828
+ attention_mask=attention_mask,
1829
+ token_type_ids=token_type_ids,
1830
+ position_ids=position_ids,
1831
+ head_mask=head_mask,
1832
+ inputs_embeds=inputs_embeds,
1833
+ output_attentions=output_attentions,
1834
+ output_hidden_states=output_hidden_states,
1835
+ return_dict=return_dict,
1836
+ )
1837
+
1838
+ pooled_output = outputs[1]
1839
+
1840
+ pooled_output = self.dropout(pooled_output)
1841
+ logits = self.classifier(pooled_output)
1842
+ reshaped_logits = logits.view(-1, num_choices)
1843
+
1844
+ loss = None
1845
+ if labels is not None:
1846
+ loss_fct = CrossEntropyLoss()
1847
+ loss = loss_fct(reshaped_logits, labels)
1848
+
1849
+ if not return_dict:
1850
+ output = (reshaped_logits,) + outputs[2:]
1851
+ return ((loss,) + output) if loss is not None else output
1852
+
1853
+ return MultipleChoiceModelOutput(
1854
+ loss=loss,
1855
+ logits=reshaped_logits,
1856
+ hidden_states=outputs.hidden_states,
1857
+ attentions=outputs.attentions,
1858
+ )
1859
+
1860
+
1861
+ @add_start_docstrings(
1862
+ """
1863
+ Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1864
+ Named-Entity-Recognition (NER) tasks.
1865
+ """,
1866
+ BERT_START_DOCSTRING,
1867
+ )
1868
+ class BertForTokenClassification(BertPreTrainedModel):
1869
+
1870
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1871
+
1872
+ def __init__(self, config):
1873
+ super().__init__(config)
1874
+ self.num_labels = config.num_labels
1875
+
1876
+ self.bert = BertModel(config, add_pooling_layer=False)
1877
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1878
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1879
+
1880
+ self.init_weights()
1881
+
1882
+ def forward(
1883
+ self,
1884
+ input_ids=None,
1885
+ attention_mask=None,
1886
+ token_type_ids=None,
1887
+ position_ids=None,
1888
+ head_mask=None,
1889
+ inputs_embeds=None,
1890
+ labels=None,
1891
+ output_attentions=None,
1892
+ output_hidden_states=None,
1893
+ return_dict=None,
1894
+ ):
1895
+ r"""
1896
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1897
+ Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1898
+ 1]``.
1899
+ """
1900
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1901
+
1902
+ outputs = self.bert(
1903
+ input_ids,
1904
+ attention_mask=attention_mask,
1905
+ token_type_ids=token_type_ids,
1906
+ position_ids=position_ids,
1907
+ head_mask=head_mask,
1908
+ inputs_embeds=inputs_embeds,
1909
+ output_attentions=output_attentions,
1910
+ output_hidden_states=output_hidden_states,
1911
+ return_dict=return_dict,
1912
+ )
1913
+
1914
+ sequence_output = outputs[0]
1915
+
1916
+ sequence_output = self.dropout(sequence_output)
1917
+ logits = self.classifier(sequence_output)
1918
+
1919
+ loss = None
1920
+ if labels is not None:
1921
+ loss_fct = CrossEntropyLoss()
1922
+ # Only keep active parts of the loss
1923
+ if attention_mask is not None:
1924
+ active_loss = attention_mask.view(-1) == 1
1925
+ active_logits = logits.view(-1, self.num_labels)
1926
+ active_labels = torch.where(
1927
+ active_loss, labels.view(-1), torch.tensor(
1928
+ loss_fct.ignore_index).type_as(labels)
1929
+ )
1930
+ loss = loss_fct(active_logits, active_labels)
1931
+ else:
1932
+ loss = loss_fct(
1933
+ logits.view(-1, self.num_labels), labels.view(-1))
1934
+
1935
+ if not return_dict:
1936
+ output = (logits,) + outputs[2:]
1937
+ return ((loss,) + output) if loss is not None else output
1938
+
1939
+ return TokenClassifierOutput(
1940
+ loss=loss,
1941
+ logits=logits,
1942
+ hidden_states=outputs.hidden_states,
1943
+ attentions=outputs.attentions,
1944
+ )
1945
+
1946
+
1947
+ @add_start_docstrings(
1948
+ """
1949
+ Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1950
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
1951
+ """,
1952
+ BERT_START_DOCSTRING,
1953
+ )
1954
+ class BertForQuestionAnswering(BertPreTrainedModel):
1955
+
1956
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1957
+
1958
+ def __init__(self, config):
1959
+ super().__init__(config)
1960
+ self.num_labels = config.num_labels
1961
+
1962
+ self.bert = BertModel(config, add_pooling_layer=False)
1963
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1964
+
1965
+ self.init_weights()
1966
+
1967
+ def forward(
1968
+ self,
1969
+ input_ids=None,
1970
+ attention_mask=None,
1971
+ token_type_ids=None,
1972
+ position_ids=None,
1973
+ head_mask=None,
1974
+ inputs_embeds=None,
1975
+ start_positions=None,
1976
+ end_positions=None,
1977
+ output_attentions=None,
1978
+ output_hidden_states=None,
1979
+ return_dict=None,
1980
+ ):
1981
+ r"""
1982
+ start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1983
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1984
+ Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
1985
+ sequence are not taken into account for computing the loss.
1986
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
1987
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1988
+ Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
1989
+ sequence are not taken into account for computing the loss.
1990
+ """
1991
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1992
+
1993
+ outputs = self.bert(
1994
+ input_ids,
1995
+ attention_mask=attention_mask,
1996
+ token_type_ids=token_type_ids,
1997
+ position_ids=position_ids,
1998
+ head_mask=head_mask,
1999
+ inputs_embeds=inputs_embeds,
2000
+ output_attentions=output_attentions,
2001
+ output_hidden_states=output_hidden_states,
2002
+ return_dict=return_dict,
2003
+ )
2004
+
2005
+ sequence_output = outputs[0]
2006
+
2007
+ logits = self.qa_outputs(sequence_output)
2008
+ start_logits, end_logits = logits.split(1, dim=-1)
2009
+ start_logits = start_logits.squeeze(-1)
2010
+ end_logits = end_logits.squeeze(-1)
2011
+
2012
+ total_loss = None
2013
+ if start_positions is not None and end_positions is not None:
2014
+ # If we are on multi-GPU, split adds a dimension
2015
+ if len(start_positions.size()) > 1:
2016
+ start_positions = start_positions.squeeze(-1)
2017
+ if len(end_positions.size()) > 1:
2018
+ end_positions = end_positions.squeeze(-1)
2019
+ # sometimes the start/end positions are outside our model inputs; we ignore these terms
2020
+ ignored_index = start_logits.size(1)
2021
+ start_positions.clamp_(0, ignored_index)
2022
+ end_positions.clamp_(0, ignored_index)
2023
+
2024
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
2025
+ start_loss = loss_fct(start_logits, start_positions)
2026
+ end_loss = loss_fct(end_logits, end_positions)
2027
+ total_loss = (start_loss + end_loss) / 2
2028
+
2029
+ if not return_dict:
2030
+ output = (start_logits, end_logits) + outputs[2:]
2031
+ return ((total_loss,) + output) if total_loss is not None else output
2032
+
2033
+ return QuestionAnsweringModelOutput(
2034
+ loss=total_loss,
2035
+ start_logits=start_logits,
2036
+ end_logits=end_logits,
2037
+ hidden_states=outputs.hidden_states,
2038
+ attentions=outputs.attentions,
2039
+ )
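
The `soft_labels` / `alpha` arguments threaded through the heads above blend the usual hard-label cross-entropy with a distillation term against teacher probabilities, as in `BertForMaskedLM.forward`. Below is a minimal standalone sketch of that blend; the tensor shapes, the 0.4 weight, and the use of plain torch calls instead of the classes in this file are illustrative assumptions, not the repo's training code:

import torch
import torch.nn.functional as F

batch, seq_len, vocab_size = 2, 8, 30522                     # made-up shapes
logits = torch.randn(batch, seq_len, vocab_size)              # student prediction_scores
labels = torch.randint(0, vocab_size, (batch, seq_len))       # -100 marks ignored positions
labels[:, 0] = -100
soft_labels = F.softmax(torch.randn(batch, seq_len, vocab_size), dim=-1)  # teacher probs
alpha = 0.4

hard_loss = F.cross_entropy(logits.view(-1, vocab_size), labels.view(-1), ignore_index=-100)
# soft cross-entropy, normalized over the vocabulary axis, averaged over unmasked tokens
distill = -(F.log_softmax(logits, dim=-1) * soft_labels).sum(dim=-1)
distill = distill[labels != -100].mean()
blended = (1 - alpha) * hard_loss + alpha * distill
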
svitt/tokenization_bert.py ADDED
@@ -0,0 +1,546 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Bert."""
16
+
17
+
18
+ import collections
19
+ import os
20
+ import unicodedata
21
+ from typing import List, Optional, Tuple
22
+
23
+ from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
24
+ from transformers.utils import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
30
+
31
+ PRETRAINED_VOCAB_FILES_MAP = {
32
+ "vocab_file": {
33
+ "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
34
+ "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
35
+ "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
36
+ "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
37
+ "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt",
38
+ "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
39
+ "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt",
40
+ "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt",
41
+ "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt",
42
+ "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt",
43
+ "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
44
+ "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt",
45
+ "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt",
46
+ "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
47
+ "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt",
48
+ "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt",
49
+ "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt",
50
+ "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt",
51
+ }
52
+ }
53
+
54
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
55
+ "bert-base-uncased": 512,
56
+ "bert-large-uncased": 512,
57
+ "bert-base-cased": 512,
58
+ "bert-large-cased": 512,
59
+ "bert-base-multilingual-uncased": 512,
60
+ "bert-base-multilingual-cased": 512,
61
+ "bert-base-chinese": 512,
62
+ "bert-base-german-cased": 512,
63
+ "bert-large-uncased-whole-word-masking": 512,
64
+ "bert-large-cased-whole-word-masking": 512,
65
+ "bert-large-uncased-whole-word-masking-finetuned-squad": 512,
66
+ "bert-large-cased-whole-word-masking-finetuned-squad": 512,
67
+ "bert-base-cased-finetuned-mrpc": 512,
68
+ "bert-base-german-dbmdz-cased": 512,
69
+ "bert-base-german-dbmdz-uncased": 512,
70
+ "TurkuNLP/bert-base-finnish-cased-v1": 512,
71
+ "TurkuNLP/bert-base-finnish-uncased-v1": 512,
72
+ "wietsedv/bert-base-dutch-cased": 512,
73
+ }
74
+
75
+ PRETRAINED_INIT_CONFIGURATION = {
76
+ "bert-base-uncased": {"do_lower_case": True},
77
+ "bert-large-uncased": {"do_lower_case": True},
78
+ "bert-base-cased": {"do_lower_case": False},
79
+ "bert-large-cased": {"do_lower_case": False},
80
+ "bert-base-multilingual-uncased": {"do_lower_case": True},
81
+ "bert-base-multilingual-cased": {"do_lower_case": False},
82
+ "bert-base-chinese": {"do_lower_case": False},
83
+ "bert-base-german-cased": {"do_lower_case": False},
84
+ "bert-large-uncased-whole-word-masking": {"do_lower_case": True},
85
+ "bert-large-cased-whole-word-masking": {"do_lower_case": False},
86
+ "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
87
+ "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
88
+ "bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
89
+ "bert-base-german-dbmdz-cased": {"do_lower_case": False},
90
+ "bert-base-german-dbmdz-uncased": {"do_lower_case": True},
91
+ "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
92
+ "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
93
+ "wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
94
+ }
95
+
96
+
97
+ def load_vocab(vocab_file):
98
+ """Loads a vocabulary file into a dictionary."""
99
+ vocab = collections.OrderedDict()
100
+ with open(vocab_file, "r", encoding="utf-8") as reader:
101
+ tokens = reader.readlines()
102
+ for index, token in enumerate(tokens):
103
+ token = token.rstrip("\n")
104
+ vocab[token] = index
105
+ return vocab
106
+
107
+
108
+ def whitespace_tokenize(text):
109
+ """Runs basic whitespace cleaning and splitting on a piece of text."""
110
+ text = text.strip()
111
+ if not text:
112
+ return []
113
+ tokens = text.split()
114
+ return tokens
115
+
116
+
117
+ class BertTokenizer(PreTrainedTokenizer):
118
+ r"""
119
+ Construct a BERT tokenizer. Based on WordPiece.
120
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
121
+ Users should refer to this superclass for more information regarding those methods.
122
+ Args:
123
+ vocab_file (:obj:`str`):
124
+ File containing the vocabulary.
125
+ do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
126
+ Whether or not to lowercase the input when tokenizing.
127
+ do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
128
+ Whether or not to do basic tokenization before WordPiece.
129
+ never_split (:obj:`Iterable`, `optional`):
130
+ Collection of tokens which will never be split during tokenization. Only has an effect when
131
+ :obj:`do_basic_tokenize=True`
132
+ unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
133
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
134
+ token instead.
135
+ sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
136
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
137
+ sequence classification or for a text and a question for question answering. It is also used as the last
138
+ token of a sequence built with special tokens.
139
+ pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
140
+ The token used for padding, for example when batching sequences of different lengths.
141
+ cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
142
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
143
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
144
+ mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
145
+ The token used for masking values. This is the token used when training this model with masked language
146
+ modeling. This is the token which the model will try to predict.
147
+ tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
148
+ Whether or not to tokenize Chinese characters.
149
+ This should likely be deactivated for Japanese (see this `issue
150
+ <https://github.com/huggingface/transformers/issues/328>`__).
151
+ strip_accents: (:obj:`bool`, `optional`):
152
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
153
+ value for :obj:`lowercase` (as in the original BERT).
154
+ """
155
+
156
+ vocab_files_names = VOCAB_FILES_NAMES
157
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
158
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
159
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
160
+
161
+ def __init__(
162
+ self,
163
+ vocab_file,
164
+ do_lower_case=True,
165
+ do_basic_tokenize=True,
166
+ never_split=None,
167
+ unk_token="[UNK]",
168
+ sep_token="[SEP]",
169
+ pad_token="[PAD]",
170
+ cls_token="[CLS]",
171
+ mask_token="[MASK]",
172
+ tokenize_chinese_chars=True,
173
+ strip_accents=None,
174
+ **kwargs
175
+ ):
176
+ super().__init__(
177
+ do_lower_case=do_lower_case,
178
+ do_basic_tokenize=do_basic_tokenize,
179
+ never_split=never_split,
180
+ unk_token=unk_token,
181
+ sep_token=sep_token,
182
+ pad_token=pad_token,
183
+ cls_token=cls_token,
184
+ mask_token=mask_token,
185
+ tokenize_chinese_chars=tokenize_chinese_chars,
186
+ strip_accents=strip_accents,
187
+ **kwargs,
188
+ )
189
+
190
+ if not os.path.isfile(vocab_file):
191
+ raise ValueError(
192
+ "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
193
+ "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
194
+ vocab_file)
195
+ )
196
+ self.vocab = load_vocab(vocab_file)
197
+ self.ids_to_tokens = collections.OrderedDict(
198
+ [(ids, tok) for tok, ids in self.vocab.items()])
199
+ self.do_basic_tokenize = do_basic_tokenize
200
+ if do_basic_tokenize:
201
+ self.basic_tokenizer = BasicTokenizer(
202
+ do_lower_case=do_lower_case,
203
+ never_split=never_split,
204
+ tokenize_chinese_chars=tokenize_chinese_chars,
205
+ strip_accents=strip_accents,
206
+ )
207
+ self.wordpiece_tokenizer = WordpieceTokenizer(
208
+ vocab=self.vocab, unk_token=self.unk_token)
209
+
210
+ @property
211
+ def do_lower_case(self):
212
+ return self.basic_tokenizer.do_lower_case
213
+
214
+ @property
215
+ def vocab_size(self):
216
+ return len(self.vocab)
217
+
218
+ def get_vocab(self):
219
+ return dict(self.vocab, **self.added_tokens_encoder)
220
+
221
+ def _tokenize(self, text):
222
+ split_tokens = []
223
+ if self.do_basic_tokenize:
224
+ for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
225
+
226
+ # If the token is part of the never_split set
227
+ if token in self.basic_tokenizer.never_split:
228
+ split_tokens.append(token)
229
+ else:
230
+ split_tokens += self.wordpiece_tokenizer.tokenize(token)
231
+ else:
232
+ split_tokens = self.wordpiece_tokenizer.tokenize(text)
233
+ return split_tokens
234
+
235
+ def _convert_token_to_id(self, token):
236
+ """ Converts a token (str) in an id using the vocab. """
237
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
238
+
239
+ def _convert_id_to_token(self, index):
240
+ """Converts an index (integer) in a token (str) using the vocab."""
241
+ return self.ids_to_tokens.get(index, self.unk_token)
242
+
243
+ def convert_tokens_to_string(self, tokens):
244
+ """ Converts a sequence of tokens (string) in a single string. """
245
+ out_string = " ".join(tokens).replace(" ##", "").strip()
246
+ return out_string
247
+
248
+ def build_inputs_with_special_tokens(
249
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
250
+ ) -> List[int]:
251
+ """
252
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
253
+ adding special tokens. A BERT sequence has the following format:
254
+ - single sequence: ``[CLS] X ``
255
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
256
+ Args:
257
+ token_ids_0 (:obj:`List[int]`):
258
+ List of IDs to which the special tokens will be added.
259
+ token_ids_1 (:obj:`List[int]`, `optional`):
260
+ Optional second list of IDs for sequence pairs.
261
+ Returns:
262
+ :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
263
+ """
264
+ if token_ids_1 is None:
265
+ return [self.cls_token_id] + token_ids_0
266
+ cls = [self.cls_token_id]
267
+ sep = [self.sep_token_id]
268
+ return cls + token_ids_0 + sep + token_ids_1 + sep
269
+
270
+ def get_special_tokens_mask(
271
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
272
+ ) -> List[int]:
273
+ """
274
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
275
+ special tokens using the tokenizer ``prepare_for_model`` method.
276
+ Args:
277
+ token_ids_0 (:obj:`List[int]`):
278
+ List of IDs.
279
+ token_ids_1 (:obj:`List[int]`, `optional`):
280
+ Optional second list of IDs for sequence pairs.
281
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
282
+ Whether or not the token list is already formatted with special tokens for the model.
283
+ Returns:
284
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
285
+ """
286
+
287
+ if already_has_special_tokens:
288
+ if token_ids_1 is not None:
289
+ raise ValueError(
290
+ "You should not supply a second sequence if the provided sequence of "
291
+ "ids is already formatted with special tokens for the model."
292
+ )
293
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
294
+
295
+ if token_ids_1 is not None:
296
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
297
+ return [1] + ([0] * len(token_ids_0)) + [1]
298
+
299
+ def create_token_type_ids_from_sequences(
300
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
301
+ ) -> List[int]:
302
+ """
303
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
304
+ pair mask has the following format:
305
+ ::
306
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
307
+ | first sequence | second sequence |
308
+ If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
309
+ Args:
310
+ token_ids_0 (:obj:`List[int]`):
311
+ List of IDs.
312
+ token_ids_1 (:obj:`List[int]`, `optional`):
313
+ Optional second list of IDs for sequence pairs.
314
+ Returns:
315
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
316
+ sequence(s).
317
+ """
318
+ sep = [self.sep_token_id]
319
+ cls = [self.cls_token_id]
320
+ if token_ids_1 is None:
321
+ return len(cls + token_ids_0 + sep) * [0]
322
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
323
+
324
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
325
+ index = 0
326
+ if os.path.isdir(save_directory):
327
+ vocab_file = os.path.join(
328
+ save_directory, (filename_prefix + "-" if filename_prefix else "") +
329
+ VOCAB_FILES_NAMES["vocab_file"]
330
+ )
331
+ else:
332
+ vocab_file = (filename_prefix +
333
+ "-" if filename_prefix else "") + save_directory
334
+ with open(vocab_file, "w", encoding="utf-8") as writer:
335
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
336
+ if index != token_index:
337
+ logger.warning(
338
+ "Saving vocabulary to {}: vocabulary indices are not consecutive."
339
+ " Please check that the vocabulary is not corrupted!".format(
340
+ vocab_file)
341
+ )
342
+ index = token_index
343
+ writer.write(token + "\n")
344
+ index += 1
345
+ return (vocab_file,)
346
+
347
+
348
+ class BasicTokenizer(object):
349
+ """
350
+ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
351
+ Args:
352
+ do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
353
+ Whether or not to lowercase the input when tokenizing.
354
+ never_split (:obj:`Iterable`, `optional`):
355
+ Collection of tokens which will never be split during tokenization. Only has an effect when
356
+ :obj:`do_basic_tokenize=True`
357
+ tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
358
+ Whether or not to tokenize Chinese characters.
359
+ This should likely be deactivated for Japanese (see this `issue
360
+ <https://github.com/huggingface/transformers/issues/328>`__).
361
+ strip_accents: (:obj:`bool`, `optional`):
362
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
363
+ value for :obj:`lowercase` (as in the original BERT).
364
+ """
365
+
366
+ def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
367
+ if never_split is None:
368
+ never_split = []
369
+ self.do_lower_case = do_lower_case
370
+ self.never_split = set(never_split)
371
+ self.tokenize_chinese_chars = tokenize_chinese_chars
372
+ self.strip_accents = strip_accents
373
+
374
+ def tokenize(self, text, never_split=None):
375
+ """
376
+ Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
377
+ WordPieceTokenizer.
378
+ Args:
379
+ **never_split**: (`optional`) list of str
380
+ Kept for backward compatibility purposes. Now implemented directly at the base class level (see
381
+ :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
382
+ """
383
+ # union() returns a new set by concatenating the two sets.
384
+ never_split = self.never_split.union(
385
+ set(never_split)) if never_split else self.never_split
386
+ text = self._clean_text(text)
387
+
388
+ # This was added on November 1st, 2018 for the multilingual and Chinese
389
+ # models. This is also applied to the English models now, but it doesn't
390
+ # matter since the English models were not trained on any Chinese data
391
+ # and generally don't have any Chinese data in them (there are Chinese
392
+ # characters in the vocabulary because Wikipedia does have some Chinese
393
+ # words in the English Wikipedia.).
394
+ if self.tokenize_chinese_chars:
395
+ text = self._tokenize_chinese_chars(text)
396
+ orig_tokens = whitespace_tokenize(text)
397
+ split_tokens = []
398
+ for token in orig_tokens:
399
+ if token not in never_split:
400
+ if self.do_lower_case:
401
+ token = token.lower()
402
+ if self.strip_accents is not False:
403
+ token = self._run_strip_accents(token)
404
+ elif self.strip_accents:
405
+ token = self._run_strip_accents(token)
406
+ split_tokens.extend(self._run_split_on_punc(token, never_split))
407
+
408
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
409
+ return output_tokens
410
+
411
+ def _run_strip_accents(self, text):
412
+ """Strips accents from a piece of text."""
413
+ text = unicodedata.normalize("NFD", text)
414
+ output = []
415
+ for char in text:
416
+ cat = unicodedata.category(char)
417
+ if cat == "Mn":
418
+ continue
419
+ output.append(char)
420
+ return "".join(output)
421
+
422
+ def _run_split_on_punc(self, text, never_split=None):
423
+ """Splits punctuation on a piece of text."""
424
+ if never_split is not None and text in never_split:
425
+ return [text]
426
+ chars = list(text)
427
+ i = 0
428
+ start_new_word = True
429
+ output = []
430
+ while i < len(chars):
431
+ char = chars[i]
432
+ if _is_punctuation(char):
433
+ output.append([char])
434
+ start_new_word = True
435
+ else:
436
+ if start_new_word:
437
+ output.append([])
438
+ start_new_word = False
439
+ output[-1].append(char)
440
+ i += 1
441
+
442
+ return ["".join(x) for x in output]
443
+
444
+ def _tokenize_chinese_chars(self, text):
445
+ """Adds whitespace around any CJK character."""
446
+ output = []
447
+ for char in text:
448
+ cp = ord(char)
449
+ if self._is_chinese_char(cp):
450
+ output.append(" ")
451
+ output.append(char)
452
+ output.append(" ")
453
+ else:
454
+ output.append(char)
455
+ return "".join(output)
456
+
457
+ def _is_chinese_char(self, cp):
458
+ """Checks whether CP is the codepoint of a CJK character."""
459
+ # This defines a "chinese character" as anything in the CJK Unicode block:
460
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
461
+ #
462
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
463
+ # despite its name. The modern Korean Hangul alphabet is a different block,
464
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
465
+ # space-separated words, so they are not treated specially and handled
466
+ # like all of the other languages.
467
+ if (
468
+ (cp >= 0x4E00 and cp <= 0x9FFF)
469
+ or (cp >= 0x3400 and cp <= 0x4DBF) #
470
+ or (cp >= 0x20000 and cp <= 0x2A6DF) #
471
+ or (cp >= 0x2A700 and cp <= 0x2B73F) #
472
+ or (cp >= 0x2B740 and cp <= 0x2B81F) #
473
+ or (cp >= 0x2B820 and cp <= 0x2CEAF) #
474
+ or (cp >= 0xF900 and cp <= 0xFAFF)
475
+ or (cp >= 0x2F800 and cp <= 0x2FA1F) #
476
+ ): #
477
+ return True
478
+
479
+ return False
480
+
481
+ def _clean_text(self, text):
482
+ """Performs invalid character removal and whitespace cleanup on text."""
483
+ output = []
484
+ for char in text:
485
+ cp = ord(char)
486
+ if cp == 0 or cp == 0xFFFD or _is_control(char):
487
+ continue
488
+ if _is_whitespace(char):
489
+ output.append(" ")
490
+ else:
491
+ output.append(char)
492
+ return "".join(output)
493
+
494
+
495
+ class WordpieceTokenizer(object):
496
+ """Runs WordPiece tokenization."""
497
+
498
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
499
+ self.vocab = vocab
500
+ self.unk_token = unk_token
501
+ self.max_input_chars_per_word = max_input_chars_per_word
502
+
503
+ def tokenize(self, text):
504
+ """
505
+ Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
506
+ tokenization using the given vocabulary.
507
+ For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.
508
+ Args:
509
+ text: A single token or whitespace separated tokens. This should have
510
+ already been passed through `BasicTokenizer`.
511
+ Returns:
512
+ A list of wordpiece tokens.
513
+ """
514
+
515
+ output_tokens = []
516
+ for token in whitespace_tokenize(text):
517
+ chars = list(token)
518
+ if len(chars) > self.max_input_chars_per_word:
519
+ output_tokens.append(self.unk_token)
520
+ continue
521
+
522
+ is_bad = False
523
+ start = 0
524
+ sub_tokens = []
525
+ while start < len(chars):
526
+ end = len(chars)
527
+ cur_substr = None
528
+ while start < end:
529
+ substr = "".join(chars[start:end])
530
+ if start > 0:
531
+ substr = "##" + substr
532
+ if substr in self.vocab:
533
+ cur_substr = substr
534
+ break
535
+ end -= 1
536
+ if cur_substr is None:
537
+ is_bad = True
538
+ break
539
+ sub_tokens.append(cur_substr)
540
+ start = end
541
+
542
+ if is_bad:
543
+ output_tokens.append(self.unk_token)
544
+ else:
545
+ output_tokens.extend(sub_tokens)
546
+ return output_tokens
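
`WordpieceTokenizer.tokenize` above is a greedy longest-match-first loop over each whitespace token. The sketch below re-implements just that loop against a toy vocabulary (the real tokenizer loads its vocabulary from vocab.txt), to make the `##` continuation convention concrete:

toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}   # illustrative vocabulary

def wordpiece(token, vocab, unk="[UNK]"):
    chars = list(token)
    pieces, start = [], 0
    while start < len(chars):
        end, cur = len(chars), None
        while start < end:                       # try the longest substring first
            sub = "".join(chars[start:end])
            if start > 0:
                sub = "##" + sub                 # continuation-piece prefix
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:                          # no piece matched: whole token -> [UNK]
            return [unk]
        pieces.append(cur)
        start = end
    return pieces

print(wordpiece("unaffable", toy_vocab))         # ['un', '##aff', '##able']
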
svitt/utils.py ADDED
@@ -0,0 +1,235 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from scipy import interpolate
5
+ import numpy as np
6
+ from einops import rearrange, repeat
7
+
8
+
9
+ def _init_transformer_weights(module, initializer_range=0.02):
10
+ """Initialize the weights. Copied from transformers ViT/Bert model init"""
11
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
12
+ # Slightly different from the TF version which uses truncated_normal for initialization
13
+ # cf https://github.com/pytorch/pytorch/pull/5617
14
+ module.weight.data.normal_(mean=0.0, std=initializer_range)
15
+ if module.bias is not None:
16
+ module.bias.data.zero_()
17
+ elif isinstance(module, nn.Embedding):
18
+ module.weight.data.normal_(mean=0.0, std=initializer_range)
19
+ if module.padding_idx is not None:
20
+ module.weight.data[module.padding_idx].zero_()
21
+ elif isinstance(module, nn.LayerNorm):
22
+ module.bias.data.zero_()
23
+ module.weight.data.fill_(1.0)
24
+
25
+
26
+ def interpolate_pos_embed(pos_embed_old, pos_embed_new, num_patches_new):
27
+ """
28
+ Args:
29
+ pos_embed_old: (1, L_old, d), pre-trained
30
+ pos_embed_new: (1, L_new, d), newly initialized, to be replaced by interpolated weights
31
+ num_patches_new:
32
+ """
33
+ # interpolate position embedding
34
+ embedding_size = pos_embed_old.shape[-1]
35
+ num_extra_tokens = pos_embed_new.shape[-2] - num_patches_new
36
+ # height (== width) for the checkpoint position embedding
37
+ orig_size = int((pos_embed_old.shape[-2] - num_extra_tokens) ** 0.5)
38
+ # height (== width) for the new position embedding
39
+ new_size = int(num_patches_new ** 0.5)
40
+
41
+ if orig_size != new_size:
42
+ # class_token and dist_token are kept unchanged
43
+ # the extra tokens seems always at the beginning of the position embedding
44
+ extra_tokens = pos_embed_old[:, :num_extra_tokens]
45
+ # only the position tokens are interpolated
46
+ pos_tokens = pos_embed_old[:, num_extra_tokens:]
47
+ pos_tokens = pos_tokens.reshape(
48
+ -1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
49
+ pos_tokens = torch.nn.functional.interpolate(
50
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
51
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
52
+ interpolated_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
53
+ return interpolated_pos_embed
54
+ else:
55
+ return pos_embed_old
56
+
57
+
58
+ def interpolate_pos_relative_bias_beit(state_dict_old, state_dict_new, patch_shape_new):
59
+ """
60
+ Args:
61
+ state_dict_old: loaded state dict
62
+ state_dict_new: state dict for model with new image size
63
+ patch_shape_new: new model patch_shape
64
+ ref: https://github.com/microsoft/unilm/blob/master/beit/run_class_finetuning.py
65
+ """
66
+ all_keys = list(state_dict_old.keys())
67
+ for key in all_keys:
68
+ if "relative_position_index" in key:
69
+ state_dict_old.pop(key)
70
+
71
+ if "relative_position_bias_table" in key:
72
+ rel_pos_bias = state_dict_old[key]
73
+ src_num_pos, num_attn_heads = rel_pos_bias.size()
74
+ dst_num_pos, _ = state_dict_new[key].size()
75
+ dst_patch_shape = patch_shape_new
76
+ if dst_patch_shape[0] != dst_patch_shape[1]:
77
+ raise NotImplementedError()
78
+ num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)
79
+ src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
80
+ dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
81
+ if src_size != dst_size:
82
+ extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
83
+ rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]
84
+
85
+ def geometric_progression(a, r, n):
86
+ return a * (1.0 - r ** n) / (1.0 - r)
87
+
88
+ left, right = 1.01, 1.5
89
+ while right - left > 1e-6:
90
+ q = (left + right) / 2.0
91
+ gp = geometric_progression(1, q, src_size // 2)
92
+ if gp > dst_size // 2:
93
+ right = q
94
+ else:
95
+ left = q
96
+
97
+ # if q > 1.090307:
98
+ # q = 1.090307
99
+
100
+ dis = []
101
+ cur = 1
102
+ for i in range(src_size // 2):
103
+ dis.append(cur)
104
+ cur += q ** (i + 1)
105
+
106
+ r_ids = [-_ for _ in reversed(dis)]
107
+
108
+ x = r_ids + [0] + dis
109
+ y = r_ids + [0] + dis
110
+
111
+ t = dst_size // 2.0
112
+ dx = np.arange(-t, t + 0.1, 1.0)
113
+ dy = np.arange(-t, t + 0.1, 1.0)
114
+
115
+ all_rel_pos_bias = []
116
+
117
+ for i in range(num_attn_heads):
118
+ z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
119
+ f = interpolate.interp2d(x, y, z, kind='cubic')
120
+ all_rel_pos_bias.append(
121
+ torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device))
122
+
123
+ rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
124
+
125
+ new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
126
+ state_dict_old[key] = new_rel_pos_bias
127
+ return state_dict_old
128
+
129
+
130
+ def interpolate_pos_relative_bias_beit_3d(state_dict_old, state_dict_new, patch_shape_new, src_t_size=1):
131
+ """
132
+ Args:
133
+ state_dict_old: loaded state dict
134
+ state_dict_new: state dict for model with new image size
135
+ patch_shape_new: new model patch_shape
136
+ ref: https://github.com/microsoft/unilm/blob/master/beit/run_class_finetuning.py
137
+ """
138
+ all_keys = list(state_dict_old.keys())
139
+ for key in all_keys:
140
+ if "relative_position_index" in key:
141
+ state_dict_old.pop(key)
142
+
143
+ if "relative_position_bias_table" in key:
144
+ src_num_pos, num_attn_heads = state_dict_old[key].size()
145
+ dst_num_pos, _ = state_dict_new[key].size()
146
+ if src_num_pos == dst_num_pos:
147
+ continue
148
+
149
+ num_extra_tokens = dst_num_pos - np.prod([w * 2 - 1 for w in patch_shape_new])
150
+
151
+ src_s_size = int((src_num_pos - num_extra_tokens) / src_t_size)
152
+ src_size = int(src_s_size ** 0.5)
153
+ dst_size = patch_shape_new[-1] * 2 - 1
154
+
155
+ if src_size != dst_size:
156
+ # Spatial interpolation
157
+ rel_pos_bias = state_dict_old[key]
158
+ extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
159
+ rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]
160
+
161
+ def geometric_progression(a, r, n):
162
+ return a * (1.0 - r ** n) / (1.0 - r)
163
+
164
+ left, right = 1.01, 1.5
165
+ while right - left > 1e-6:
166
+ q = (left + right) / 2.0
167
+ gp = geometric_progression(1, q, src_size // 2)
168
+ if gp > dst_size // 2:
169
+ right = q
170
+ else:
171
+ left = q
172
+
173
+ # if q > 1.090307:
174
+ # q = 1.090307
175
+
176
+ dis = []
177
+ cur = 1
178
+ for i in range(src_size // 2):
179
+ dis.append(cur)
180
+ cur += q ** (i + 1)
181
+
182
+ r_ids = [-_ for _ in reversed(dis)]
183
+
184
+ x = r_ids + [0] + dis
185
+ y = r_ids + [0] + dis
186
+
187
+ t = dst_size // 2.0
188
+ dx = np.arange(-t, t + 0.1, 1.0)
189
+ dy = np.arange(-t, t + 0.1, 1.0)
190
+
191
+ all_rel_pos_bias = []
192
+
193
+ for i in range(num_attn_heads):
194
+ z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
195
+ f = interpolate.interp2d(x, y, z, kind='cubic')
196
+ all_rel_pos_bias.append(
197
+ torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device))
198
+
199
+ rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
200
+
201
+ new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
202
+ state_dict_old[key] = new_rel_pos_bias
203
+
204
+ dst_t_size = patch_shape_new[0] * 2 - 1
205
+ if src_t_size != dst_t_size:
206
+ # Temporal interpolation
207
+ rel_pos_bias = state_dict_old[key]
208
+ extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
209
+ rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]
210
+
211
+ if src_t_size == 1:
212
+ rel_pos_bias = repeat(rel_pos_bias, 's d -> (t s) d', t=dst_t_size)
213
+ else:
214
+ rel_pos_bias = rearrange(rel_pos_bias, '(t s) d -> s d t', t=src_t_size)
215
+ rel_pos_bias = F.interpolate(rel_pos_bias, dst_t_size, mode='nearest')
216
+ rel_pos_bias = rearrange(rel_pos_bias, 's d t -> (t s) d')
217
+ new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
218
+ state_dict_old[key] = new_rel_pos_bias
219
+
220
+ return state_dict_old
221
+
222
+
+ def tile(x, dim, n_tile):
+     init_dim = x.size(dim)
+     repeat_idx = [1] * x.dim()
+     repeat_idx[dim] = n_tile
+     x = x.repeat(*repeat_idx)
+     order_index = torch.LongTensor(np.concatenate(
+         [init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
+     return torch.index_select(x, dim, order_index.to(x.device))
+
+
+ def mask_logits(target, mask):
+     return target * mask + (1 - mask) * (-1e10)
+
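For illustration only, a minimal sketch of how interpolate_pos_relative_bias_beit_3d might be invoked when inflating an image-pretrained BEiT-style checkpoint to a video backbone; the builder function and checkpoint path below are hypothetical placeholders, not APIs from this repo.

    # Hypothetical model whose state_dict contains relative_position_bias_table keys.
    video_model = build_video_backbone(patch_shape=(2, 14, 14))          # placeholder builder
    ckpt = torch.load("beit_image_checkpoint.pth", map_location="cpu")   # placeholder path
    # Image checkpoints carry no temporal axis, hence src_t_size=1; the helper first
    # resizes the spatial bias table, then repeats it across the temporal axis.
    ckpt = interpolate_pos_relative_bias_beit_3d(
        state_dict_old=ckpt,
        state_dict_new=video_model.state_dict(),
        patch_shape_new=(2, 14, 14),
        src_t_size=1,
    )
    video_model.load_state_dict(ckpt, strict=False)

The two small helpers above are easiest to read from their effect on toy tensors (the values are worked out by hand, not taken from the repo):

    x = torch.tensor([[1, 2], [3, 4]])
    tile(x, dim=0, n_tile=2)
    # tensor([[1, 2], [1, 2], [3, 4], [3, 4]])  -- each row repeated in place,
    # unlike x.repeat(2, 1), which would give [[1, 2], [3, 4], [1, 2], [3, 4]]

    mask_logits(torch.tensor([2.0, 0.5, -1.0]), torch.tensor([1.0, 0.0, 1.0]))
    # tensor([ 2.0000e+00, -1.0000e+10, -1.0000e+00])  -- masked positions are pushed
    # to roughly -1e10 so they vanish after a softmax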
svitt/video_transforms.py ADDED
@@ -0,0 +1,186 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import math
+ from typing import Sequence
+ import torch
+ import torch.nn as nn
+ from torchvision import transforms
+
+
+ class Permute(nn.Module):
+     """
+     Permutation as an op
+     """
+
+     def __init__(self, ordering):
+         super().__init__()
+         self.ordering = ordering
+
+     def forward(self, frames):
+         """
+         Args:
+             frames in some ordering, by default (C, T, H, W)
+         Returns:
+             frames in the ordering that was specified
+         """
+         return frames.permute(self.ordering)
+
+
+ class TemporalCrop(nn.Module):
+     """
+     Convert the video into smaller clips temporally.
+     """
+
+     def __init__(
+         self, frames_per_clip: int = 8, stride: int = 8, frame_stride: int = 1
+     ):
+         super().__init__()
+         self.frames = frames_per_clip
+         self.stride = stride
+         self.frame_stride = frame_stride
+
+     def forward(self, video):
+         assert video.ndim == 4, "Must be (C, T, H, W)"
+         res = []
+         for start in range(
+             0, video.size(1) - (self.frames * self.frame_stride) + 1, self.stride
+         ):
+             end = start + (self.frames) * self.frame_stride
+             res.append(video[:, start: end: self.frame_stride, ...])
+         return res
+
+
+ def crop_boxes(boxes, x_offset, y_offset):
+     """
+     Perform crop on the bounding boxes given the offsets.
+     Args:
+         boxes (ndarray or None): bounding boxes to perform crop. The dimension
+             is `num boxes` x 4.
+         x_offset (int): cropping offset in the x axis.
+         y_offset (int): cropping offset in the y axis.
+     Returns:
+         cropped_boxes (ndarray or None): the cropped boxes with dimension of
+             `num boxes` x 4.
+     """
+     cropped_boxes = boxes.copy()
+     cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
+     cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset
+
+     return cropped_boxes
+
+
+ def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
+     """
+     Perform uniform spatial sampling on the images and corresponding boxes.
+     Args:
+         images (tensor): images to perform uniform crop. The dimension is
+             `num frames` x `channel` x `height` x `width`.
+         size (int): size of height and width to crop the images.
+         spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
+             is larger than height. Or 0, 1, or 2 for top, center, and bottom
+             crop if height is larger than width.
+         boxes (ndarray or None): optional. Corresponding boxes to images.
+             Dimension is `num boxes` x 4.
+         scale_size (int): optional. If not None, resize the images to scale_size before
+             performing any crop.
+     Returns:
+         cropped (tensor): images with dimension of
+             `num frames` x `channel` x `size` x `size`.
+         cropped_boxes (ndarray or None): the cropped boxes with dimension of
+             `num boxes` x 4.
+     """
+     assert spatial_idx in [0, 1, 2]
+     ndim = len(images.shape)
+     if ndim == 3:
+         images = images.unsqueeze(0)
+     height = images.shape[2]
+     width = images.shape[3]
+
+     if scale_size is not None:
+         if width <= height:
+             width, height = scale_size, int(height / width * scale_size)
+         else:
+             width, height = int(width / height * scale_size), scale_size
+         images = torch.nn.functional.interpolate(
+             images,
+             size=(height, width),
+             mode="bilinear",
+             align_corners=False,
+         )
+
+     y_offset = int(math.ceil((height - size) / 2))
+     x_offset = int(math.ceil((width - size) / 2))
+
+     if height > width:
+         if spatial_idx == 0:
+             y_offset = 0
+         elif spatial_idx == 2:
+             y_offset = height - size
+     else:
+         if spatial_idx == 0:
+             x_offset = 0
+         elif spatial_idx == 2:
+             x_offset = width - size
+     cropped = images[:, :, y_offset: y_offset + size, x_offset: x_offset + size]
+     cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
+     if ndim == 3:
+         cropped = cropped.squeeze(0)
+     return cropped, cropped_boxes
+
+
+ class SpatialCrop(nn.Module):
+     """
+     Convert the video into 3 smaller clips spatially. Must be used after the
+     temporal crops to get spatial crops, and should be used with
+     -2 in the spatial crop at the slowfast augmentation stage (so full
+     frames are passed in here). Will return a larger list with the
+     3x spatial crops as well. It's useful for 3x4 testing (e.g. in SwinT)
+     or 3x10 testing in SlowFast etc.
+     """
+
+     def __init__(self, crop_size: int = 224, num_crops: int = 3):
+         super().__init__()
+         self.crop_size = crop_size
+         if num_crops == 6:
+             self.crops_to_ext = [0, 1, 2]
+             # I guess Swin uses 5 crops without flipping, but that doesn't
+             # make sense given they first resize to 224 and take 224 crops.
+             # (pg 6 of https://arxiv.org/pdf/2106.13230.pdf)
+             # So I'm assuming we can use flipped crops and that will add something.
+             self.flipped_crops_to_ext = [0, 1, 2]
+         elif num_crops == 3:
+             self.crops_to_ext = [0, 1, 2]
+             self.flipped_crops_to_ext = []
+         elif num_crops == 1:
+             self.crops_to_ext = [1]
+             self.flipped_crops_to_ext = []
+         else:
+             raise NotImplementedError(
+                 "Nothing else supported yet, "
+                 "slowfast only takes 0, 1, 2 as arguments"
+             )
+
+     def forward(self, videos: Sequence[torch.Tensor]):
+         """
+         Args:
+             videos: A list of C, T, H, W videos.
+         Returns:
+             videos: A list with 3x the number of elements. Each video converted
+                 to C, T, H', W' by spatial cropping.
+         """
+         assert isinstance(videos, list), "Must be a list of videos after temporal crops"
+         assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)"
+         res = []
+         for video in videos:
+             for spatial_idx in self.crops_to_ext:
+                 res.append(uniform_crop(video, self.crop_size, spatial_idx)[0])
+             if not self.flipped_crops_to_ext:
+                 continue
+             flipped_video = transforms.functional.hflip(video)
+             for spatial_idx in self.flipped_crops_to_ext:
+                 res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0])
+         return res
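A short sketch of how these modules compose for multi-view evaluation; the clip shape is arbitrary and the actual preprocessing pipeline is defined elsewhere in the repo.

    video = torch.randn(3, 16, 256, 256)                       # (C, T, H, W) dummy clip
    clips = TemporalCrop(frames_per_clip=8, stride=8)(video)   # 2 temporal windows: t=0..7 and t=8..15
    views = SpatialCrop(crop_size=224, num_crops=3)(clips)     # left/center/right crop of each window
    # len(views) == 6, each view is (3, 8, 224, 224) and can be stacked into a batch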