d22cs051 committed on
Commit 8273cb9
0 Parent(s):

retrying pushing the code

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +35 -0
  2. .gitignore +129 -0
  3. Dockerfile +32 -0
  4. README.md +11 -0
  5. app.py +71 -0
  6. config.py +149 -0
  7. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.circleci/config.yml +159 -0
  8. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE.md +3 -0
  9. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/bug_report.md +43 -0
  10. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/documentation.md +15 -0
  11. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/feature_request.md +24 -0
  12. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/how-to-question.md +33 -0
  13. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  14. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/stale.yml +30 -0
  15. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/workflows/build.yml +60 -0
  16. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/workflows/build_wheels.yml +41 -0
  17. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.gitignore +136 -0
  18. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.gitmodules +4 -0
  19. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.isort.cfg +2 -0
  20. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.pre-commit-config.yaml +40 -0
  21. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/CODE_OF_CONDUCT.md +77 -0
  22. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/CONTRIBUTING.md +82 -0
  23. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/LICENSE +21 -0
  24. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/README.md +236 -0
  25. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/.gitignore +2 -0
  26. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/.gitignore +139 -0
  27. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/CONFIG.md +41 -0
  28. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/DATASET.md +34 -0
  29. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/README.md +166 -0
  30. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/endtask.md +41 -0
  31. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/locallaunch.py +148 -0
  32. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/__init__.py +12 -0
  33. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/datasets/__init__.py +10 -0
  34. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/datasets/fairseqmmdataset.py +57 -0
  35. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/datasets/mmdataset.py +111 -0
  36. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/__init__.py +13 -0
  37. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/evaluator.py +54 -0
  38. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/metric.py +313 -0
  39. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/predictor.py +595 -0
  40. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/__init__.py +16 -0
  41. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/fairseqmmloss.py +63 -0
  42. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/loss.py +87 -0
  43. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/nce.py +156 -0
  44. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/__init__.py +17 -0
  45. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/fairseqmmmodel.py +51 -0
  46. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/mmfusion.py +926 -0
  47. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/mmfusionnlg.py +999 -0
  48. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/transformermodel.py +734 -0
  49. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/modules/__init__.py +10 -0
  50. fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/modules/mm.py +145 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.8.1-slim-buster
+
+
+ WORKDIR /code
+
+ COPY . /code
+
+ # RUN useradd -m -u 1000 user
+
+ RUN apt-get update
+ RUN apt-get install build-essential -y
+ # RUN pip install --no-cache-dir -r requirements.txt
+ RUN pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
+ # WORKDIR fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1
+ RUN pip install -e fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.
+ RUN pip install -r requirements.txt --no-cache-dir
+ RUN pip install gradio --no-cache-dir
+ RUN pip install protobuf==3.20.* --no-cache-dir
+
+ # Switch to the "user" user
+ # USER user
+
+ # Set home to the user's home directory
+ # ENV HOME=/home/user \
+ #     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ # WORKDIR $HOME/code
+
+ # COPY --chown=user . $HOME/code
+ # RUN ls -la $HOME/code
+ CMD ["python3", "app.py"]
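
Editor's note (not part of the commit): a quick way to sanity-check an image built from this Dockerfile is to confirm that the pinned PyTorch build and the bundled editable fairseq checkout are importable before `app.py` starts. This is a minimal sketch under the assumption that the image was built exactly as above; the file name `sanity_check.py` is hypothetical.

```python
# sanity_check.py -- hypothetical helper, not part of this commit.
# Assumes the container was built from the Dockerfile above, which installs
# torch 1.8.1+cu111 and the local fairseq checkout in editable mode.
import torch
import fairseq

print("torch:", torch.__version__)            # expected to start with "1.8.1"
print("fairseq:", fairseq.__version__)        # version of the editable install
print("CUDA available:", torch.cuda.is_available())
```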
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: Audio Deepfake Detection
+ emoji: 🐢
+ colorFrom: indigo
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,71 @@
+ import gradio as gr
+ import torch
+ from model import Model
+ from config import Config
+
+ import warnings
+ # warnings.filterwarnings('ignore')
+
+ # making config object
+ config = Config()
+
+
+
+ def inference(audio_file1):
+     print(f"[LOG] Audio file: {audio_file1}")
+
+ class DFSeparationApp:
+     def __init__(self, model_path, device="cpu"):
+         self.device = device
+         self.model = self.load_model(model_path)
+         self.model.to(self.device)
+
+
+     def load_model(self, model_path):
+         checkpoint = torch.load(model_path, map_location=torch.device("cpu"))
+         fine_tuned_model = Model(
+             args=config,
+             device=self.device
+         )
+         fine_tuned_model.load_state_dict(checkpoint["model"])
+         print("[LOG] Model loaded successfully.")
+         return fine_tuned_model
+
+     def predict(self, audio_file):
+         # Load the audio file
+         audio_tensor = torch.tensor(audio_file[1]).to(self.device)
+         with torch.no_grad():
+             # Make prediction
+             output = self.model(audio_tensor)
+             preds = output.argmax(dim=-1)
+             probs = output.softmax(dim=-1)
+         print(f"[LOG] Prediction: {preds.item()}")
+         print(f"[LOG] Probability: {probs.max().item()}")
+         return preds.item(), probs.max().item()
+
+     def run(self):
+         print("[LOG] Running the app...")
+         # gradio interface
+         audio_input1 = gr.Audio(label="Upload or record audio")
+         prediction = gr.Label(label="Prediction:")
+         prob = gr.Label(label="Probability:")
+         gr.Interface(
+             fn=self.predict,
+             inputs=[audio_input1],
+             outputs=[prediction, prob],
+             title="DF Separation",
+             description="This app classifies audio samples as Real or Fake.",
+             examples=[
+                 ["samples/Fake/download (5).wav", "1"],
+                 ["samples/Fake/fake1_1.wav", "1"],
+                 ["samples/Real/Central Avenue 1.wav", "0"],
+                 ["samples/Real/hindi.mp3", "0"],
+             ]
+         ).launch(quiet=False, server_name="0.0.0.0")
+
+ if __name__ == "__main__":
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"[LOG] Device: {device}")
+     model_path = "models/for_trained_model.ckpt"  # Replace with your model path
+     app = DFSeparationApp(model_path, device=device)
+     app.run()
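
Editor's note (not part of the commit): `gr.Audio` passes a recording to the callback as a `(sample_rate, numpy_array)` tuple, which is why `predict` indexes `audio_file[1]`. The snippet below is a self-contained sketch of that conversion step using a dummy waveform; the model itself is not loaded here, and the shapes are illustrative only.

```python
# Hypothetical illustration of the input format DFSeparationApp.predict() receives.
import numpy as np
import torch

sample_rate = 16000
waveform = np.random.randn(sample_rate).astype(np.float32)  # 1 second of fake audio
audio_file = (sample_rate, waveform)        # what Gradio hands to the callback

audio_tensor = torch.tensor(audio_file[1])  # same indexing as in predict()
print(audio_tensor.shape)                   # torch.Size([16000])
```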
config.py ADDED
@@ -0,0 +1,149 @@
+ class Config:
+     def __init__(self):
+         self.custom_data_dir = 'data/Dataset_Speech_Assignment'
+         self.for2sec_data_dir = 'data/for-2seconds'
+         self.batch_size = 32
+         self.num_workers = 4
+         self.num_epochs = 50
+         self.lr = 1e-3
+         self.model_checkpoint_path = 'models/Best_LA_model_for_DF.pth'
+
+         ############################################################################
+         """
+         parser.add_argument('--algo', type=int, default=3,
+                             help='Rawboost algos discriptions. 0: No augmentation 1: LnL_convolutive_noise, 2: ISD_additive_noise, 3: SSI_additive_noise, 4: series algo (1+2+3), \
+                                   5: series algo (1+2), 6: series algo (1+3), 7: series algo(2+3), 8: parallel algo(1,2) .default=0]')
+
+         # LnL_convolutive_noise parameters
+         parser.add_argument('--nBands', type=int, default=5,
+                             help='number of notch filters.The higher the number of bands, the more aggresive the distortions is.[default=5]')
+         parser.add_argument('--minF', type=int, default=20,
+                             help='minimum centre frequency [Hz] of notch filter.[default=20] ')
+         parser.add_argument('--maxF', type=int, default=8000,
+                             help='maximum centre frequency [Hz] (<sr/2) of notch filter.[default=8000]')
+         parser.add_argument('--minBW', type=int, default=100,
+                             help='minimum width [Hz] of filter.[default=100] ')
+         parser.add_argument('--maxBW', type=int, default=1000,
+                             help='maximum width [Hz] of filter.[default=1000] ')
+         parser.add_argument('--minCoeff', type=int, default=10,
+                             help='minimum filter coefficients. More the filter coefficients more ideal the filter slope.[default=10]')
+         parser.add_argument('--maxCoeff', type=int, default=100,
+                             help='maximum filter coefficients. More the filter coefficients more ideal the filter slope.[default=100]')
+         parser.add_argument('--minG', type=int, default=0,
+                             help='minimum gain factor of linear component.[default=0]')
+         parser.add_argument('--maxG', type=int, default=0,
+                             help='maximum gain factor of linear component.[default=0]')
+         parser.add_argument('--minBiasLinNonLin', type=int, default=5,
+                             help=' minimum gain difference between linear and non-linear components.[default=5]')
+         parser.add_argument('--maxBiasLinNonLin', type=int, default=20,
+                             help=' maximum gain difference between linear and non-linear components.[default=20]')
+         parser.add_argument('--N_f', type=int, default=5,
+                             help='order of the (non-)linearity where N_f=1 refers only to linear components.[default=5]')
+
+         # ISD_additive_noise parameters
+         parser.add_argument('--P', type=int, default=10,
+                             help='Maximum number of uniformly distributed samples in [%].[defaul=10]')
+         parser.add_argument('--g_sd', type=int, default=2,
+                             help='gain parameters > 0. [default=2]')
+
+         # SSI_additive_noise parameters
+         parser.add_argument('--SNRmin', type=int, default=10,
+                             help='Minimum SNR value for coloured additive noise.[defaul=10]')
+         parser.add_argument('--SNRmax', type=int, default=40,
+                             help='Maximum SNR value for coloured additive noise.[defaul=40]')
+         """
+         ############################################################################
+         # conversion from argparse to class object
+         self.algo = 3
+         self.nBands = 5
+         self.minF = 20
+         self.maxF = 8000
+         self.minBW = 100
+         self.maxBW = 1000
+         self.minCoeff = 10
+         self.maxCoeff = 100
+         self.minG = 0
+         self.maxG = 0
+         self.minBiasLinNonLin = 5
+         self.maxBiasLinNonLin = 20
+         self.N_f = 5
+         self.P = 10
+         self.g_sd = 2
+         self.SNRmin = 10
+         self.SNRmax = 40
+
+
+         #############################################################################
+         """
+         parser.add_argument('--database_path', type=str, default='/your/path/to/data/ASVspoof_database/DF/', help='Change this to user\'s full directory address of LA database (ASVspoof2019- for training & development (used as validation), ASVspoof2021 DF for evaluation scores). We assume that all three ASVspoof 2019 LA train, LA dev and ASVspoof2021 DF eval data folders are in the same database_path directory.')
+         '''
+         % database_path/
+         %   |- DF
+         %      |- ASVspoof2021_DF_eval/flac
+         %      |- ASVspoof2019_LA_train/flac
+         %      |- ASVspoof2019_LA_dev/flac
+         '''
+
+         parser.add_argument('--protocols_path', type=str, default='database/', help='Change with path to user\'s DF database protocols directory address')
+         '''
+         % protocols_path/
+         %   |- ASVspoof_LA_cm_protocols
+         %      |- ASVspoof2021.LA.cm.eval.trl.txt
+         %      |- ASVspoof2019.LA.cm.dev.trl.txt
+         %      |- ASVspoof2019.LA.cm.train.trn.txt
+
+         %   |- ASVspoof_DF_cm_protocols
+         %      |- ASVspoof2021.DF.cm.eval.trl.txt
+
+         '''
+
+         # Hyperparameters
+         parser.add_argument('--batch_size', type=int, default=14)
+         parser.add_argument('--num_epochs', type=int, default=100)
+         parser.add_argument('--lr', type=float, default=0.000001)
+         parser.add_argument('--weight_decay', type=float, default=0.0001)
+         parser.add_argument('--loss', type=str, default='weighted_CCE')
+         # model
+         parser.add_argument('--seed', type=int, default=1234,
+                             help='random seed (default: 1234)')
+
+         parser.add_argument('--model_path', type=str,
+                             default=None, help='Model checkpoint')
+         parser.add_argument('--comment', type=str, default=None,
+                             help='Comment to describe the saved model')
+         # Auxiliary arguments
+         parser.add_argument('--track', type=str, default='DF', choices=['LA', 'PA', 'DF'], help='LA/PA/DF')
+         parser.add_argument('--eval_output', type=str, default=None,
+                             help='Path to save the evaluation result')
+         parser.add_argument('--eval', action='store_true', default=False,
+                             help='eval mode')
+         parser.add_argument('--is_eval', action='store_true', default=False, help='eval database')
+         parser.add_argument('--eval_part', type=int, default=0)
+         # backend options
+         parser.add_argument('--cudnn-deterministic-toggle', action='store_false', \
+                             default=True,
+                             help='use cudnn-deterministic? (default true)')
+
+         parser.add_argument('--cudnn-benchmark-toggle', action='store_true', \
+                             default=False,
+                             help='use cudnn-benchmark? (default false)')
+         """
+
+         self.weight_decay = 0.0001
+         self.loss = 'weighted_CCE'
+         self.seed = 1234
+         self.model_path = "models/LA_model.pth"
+         self.comment = None
+         self.track = 'DF'
+         self.eval_output = None
+         self.eval = False
+         self.is_eval = False
+         self.eval_part = 0
+         self.cudnn_deterministic_toggle = False
+         self.cudnn_benchmark_toggle = False
+
+         self.wandb_config = {
+             'project': 'Speech Assignment 3',
+             'run_name': 'LA_model',
+         }
+
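
Editor's note (not part of the commit): the "conversion from argparse to class object" comment describes collapsing the original training script's CLI flags into plain attributes. A generic, repository-independent sketch of that pattern is shown below; the flags used are just examples.

```python
# Generic argparse-to-object conversion, analogous to what Config does by hand.
import argparse
from types import SimpleNamespace

parser = argparse.ArgumentParser()
parser.add_argument("--algo", type=int, default=3)
parser.add_argument("--nBands", type=int, default=5)
parser.add_argument("--lr", type=float, default=1e-3)

args = parser.parse_args([])             # use defaults instead of reading sys.argv
config = SimpleNamespace(**vars(args))   # attribute access, like the Config class
print(config.algo, config.nBands, config.lr)
```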
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.circleci/config.yml ADDED
@@ -0,0 +1,159 @@
+ # Use 2.1 for orbs
+ version: 2.1
+
+ # -------------------------------------------------------------------------------------
+ # Environments to run the jobs in
+ # -------------------------------------------------------------------------------------
+ gpu: &gpu
+   environment:
+     CUDA_VERSION: "11.1"
+   machine:
+     image: ubuntu-1604-cuda-11.1:202012-01
+   resource_class: gpu.nvidia.medium.multi
+
+
+ # -------------------------------------------------------------------------------------
+ # Re-usable commands
+ # -------------------------------------------------------------------------------------
+ cache_key: &cache_key cache-key-{{ .Environment.CIRCLE_JOB }}-{{ checksum ".circleci/config.yml" }}-{{ checksum "setup.py"}}
+
+ install_dep_common: &install_dep_common
+   - run:
+       name: Install Common Dependencies
+       command: |
+         source activate fairseq
+         pip install --upgrade setuptools
+         pip install bitarray boto3 deepspeed editdistance fastBPE iopath ipdb ipython pyarrow pytest sacremoses sentencepiece subword-nmt hydra-core==1.0.7 omegaconf==2.0.6
+         pip install --progress-bar off pytest
+         pip install --progress-bar off fairscale
+         pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda111 -U
+         python -c 'import torch; print("Torch version:", torch.__version__)'
+         python -m torch.utils.collect_env
+
+ install_dep_fused_ops: &install_dep_fused_ops
+   - run:
+       name: Install Megatron/Apex Dependencies
+       working_directory: ~/
+       command: |
+         source activate fairseq
+         git clone https://github.com/NVIDIA/apex
+         cd apex
+         git checkout e2083df5eb96643c61613b9df48dd4eea6b07690
+         pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" --global-option="--xentropy" --global-option="--fast_multihead_attn" ./
+         cd ~/
+         git clone --depth=1 --branch v2.4 https://github.com/NVIDIA/Megatron-LM.git
+         cd Megatron-LM
+         pip install -e .
+
+
+ install_dep_pt19: &install_dep_pt19
+   - run:
+       name: Install Pytorch Dependencies
+       command: |
+         source activate fairseq
+         pip install --upgrade setuptools
+         pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+         python -c 'import torch; print("Torch version:", torch.__version__)'
+
+ install_dep_pt18: &install_dep_pt18
+   - run:
+       name: Install Pytorch Dependencies
+       command: |
+         source activate fairseq
+         pip install --upgrade setuptools
+         pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
+         python -c 'import torch; print("Torch version:", torch.__version__)'
+
+ install_repo: &install_repo
+   - run:
+       name: Install Repository
+       command: |
+         source activate fairseq
+         pip install .
+         python setup.py build_ext --inplace
+
+ run_unittests: &run_unittests
+   - run:
+       name: Run Unit Tests
+       command: |
+         source activate fairseq
+         pytest tests/gpu/test_binaries_gpu.py
+
+ check_nvidia_driver: &check_nvidia_driver
+   - run:
+       name: Check NVIDIA Driver
+       working_directory: ~/
+       command: |
+         pyenv versions
+         nvidia-smi
+
+ create_conda_env: &create_conda_env
+   - run:
+       name: Install and Create Conda Environment
+       command: |
+         curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+         chmod +x ~/miniconda.sh
+         ~/miniconda.sh -b -p $HOME/miniconda
+         rm ~/miniconda.sh
+         echo 'export PATH=$HOME/miniconda/bin:$PATH' >> $BASH_ENV
+         source $BASH_ENV
+         if [ ! -d ~/miniconda/envs/fairseq ]
+         then
+           conda create -y -n fairseq python=3.8
+         fi
+         source activate fairseq
+         python --version
+         pip install --upgrade pip
+ # -------------------------------------------------------------------------------------
+ # Jobs to run
+ # -------------------------------------------------------------------------------------
+
+ jobs:
+   gpu_tests_pt19:
+     <<: *gpu
+
+     working_directory: ~/fairseq-py
+
+     steps:
+       - checkout
+       - <<: *check_nvidia_driver
+       - <<: *create_conda_env
+       - restore_cache:
+           key: *cache_key
+       - <<: *install_dep_pt19
+       - <<: *install_dep_common
+       - <<: *install_dep_fused_ops
+       - save_cache:
+           paths:
+             - ~/miniconda/
+           key: *cache_key
+       - <<: *install_repo
+       - <<: *run_unittests
+
+   gpu_tests_pt18:
+     <<: *gpu
+
+     working_directory: ~/fairseq-py
+
+     steps:
+       - checkout
+       - <<: *check_nvidia_driver
+       - <<: *create_conda_env
+       - restore_cache:
+           key: *cache_key
+       - <<: *install_dep_pt18
+       - <<: *install_dep_common
+       - <<: *install_dep_fused_ops
+       - save_cache:
+           paths:
+             - ~/miniconda/
+           key: *cache_key
+       - <<: *install_repo
+       - <<: *run_unittests
+
+ workflows:
+   version: 2
+   build:
+     jobs:
+       - gpu_tests_pt18
+       - gpu_tests_pt19
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE.md ADDED
@@ -0,0 +1,3 @@
+ ## 👉 [Please follow one of these issue templates](https://github.com/pytorch/fairseq/issues/new/choose) 👈
+
+ Note: to keep the backlog clean and actionable, issues may be immediately closed if they do not follow one of the above issue templates.
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,43 @@
+ ---
+ name: 🐛 Bug Report
+ about: Submit a bug report to help us improve
+ labels: 'bug, needs triage'
+ ---
+
+ ## 🐛 Bug
+
+ <!-- A clear and concise description of what the bug is. -->
+
+ ### To Reproduce
+
+ Steps to reproduce the behavior (**always include the command you ran**):
+
+ 1. Run cmd '....'
+ 2. See error
+
+ <!-- If you have a code sample, error messages, stack traces, please provide it here as well -->
+
+
+ #### Code sample
+ <!-- Ideally attach a minimal code sample to reproduce the described issue.
+ Minimal means having the shortest code but still preserving the bug. -->
+
+ ### Expected behavior
+
+ <!-- A clear and concise description of what you expected to happen. -->
+
+ ### Environment
+
+  - fairseq Version (e.g., 1.0 or main):
+  - PyTorch Version (e.g., 1.0)
+  - OS (e.g., Linux):
+  - How you installed fairseq (`pip`, source):
+  - Build command you used (if compiling from source):
+  - Python version:
+  - CUDA/cuDNN version:
+  - GPU models and configuration:
+  - Any other relevant information:
+
+ ### Additional context
+
+ <!-- Add any other context about the problem here. -->
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/documentation.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ name: 📚 Documentation/Typos
+ about: Report an issue related to documentation or a typo
+ labels: 'documentation, needs triage'
+ ---
+
+ ## 📚 Documentation
+
+ For typos and doc fixes, please go ahead and:
+
+ 1. Create an issue.
+ 2. Fix the typo.
+ 3. Submit a PR.
+
+ Thanks!
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,24 @@
+ ---
+ name: 🚀 Feature Request
+ about: Submit a proposal/request for a new feature
+ labels: 'enhancement, help wanted, needs triage'
+ ---
+
+ ## 🚀 Feature Request
+ <!-- A clear and concise description of the feature proposal -->
+
+ ### Motivation
+
+ <!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too -->
+
+ ### Pitch
+
+ <!-- A clear and concise description of what you want to happen. -->
+
+ ### Alternatives
+
+ <!-- A clear and concise description of any alternative solutions or features you've considered, if any. -->
+
+ ### Additional context
+
+ <!-- Add any other context or screenshots about the feature request here. -->
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/ISSUE_TEMPLATE/how-to-question.md ADDED
@@ -0,0 +1,33 @@
+ ---
+ name: ❓ Questions/Help
+ about: If you have questions, please first search existing issues and docs
+ labels: 'question, needs triage'
+ ---
+
+ ## ❓ Questions and Help
+
+ ### Before asking:
+ 1. search the issues.
+ 2. search the docs.
+
+ <!-- If you still can't find what you need: -->
+
+ #### What is your question?
+
+ #### Code
+
+ <!-- Please paste a code snippet if your question requires it! -->
+
+ #### What have you tried?
+
+ #### What's your environment?
+
+  - fairseq Version (e.g., 1.0 or main):
+  - PyTorch Version (e.g., 1.0)
+  - OS (e.g., Linux):
+  - How you installed fairseq (`pip`, source):
+  - Build command you used (if compiling from source):
+  - Python version:
+  - CUDA/cuDNN version:
+  - GPU models and configuration:
+  - Any other relevant information:
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,16 @@
+ # Before submitting
+
+ - [ ] Was this discussed/approved via a Github issue? (no need for typos, doc improvements)
+ - [ ] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/main/CONTRIBUTING.md)?
+ - [ ] Did you make sure to update the docs?
+ - [ ] Did you write any new necessary tests?
+
+ ## What does this PR do?
+ Fixes # (issue).
+
+ ## PR review
+ Anyone in the community is free to review the PR once the tests have passed.
+ If we didn't discuss your PR in Github issues there's a high chance it will not be merged.
+
+ ## Did you have fun?
+ Make sure you had fun coding 🙃
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/stale.yml ADDED
@@ -0,0 +1,30 @@
+ # Configuration for probot-stale - https://github.com/probot/stale
+ # Mostly copied from github.com/facebook/react/blob/master/.github/stale.yml
+ # Number of days of inactivity before an issue becomes stale
+ daysUntilStale: 90
+ # Number of days of inactivity before a stale issue is closed
+ daysUntilClose: 7
+ # Issues with these labels will never be considered stale
+ exemptLabels:
+   - bug
+ # Label to use when marking an issue as stale
+ staleLabel: stale
+ issues:
+   # Comment to post when marking an issue as stale.
+   markComment: >
+     This issue has been automatically marked as stale.
+     **If this issue is still affecting you, please leave any comment** (for example, "bump"), and we'll keep it open.
+     We are sorry that we haven't been able to prioritize it yet. If you have any new additional information, please include it with your comment!
+   # Comment to post when closing a stale issue.
+   closeComment: >
+     Closing this issue after a prolonged period of inactivity. If this issue is still present in the latest release, please create a new issue with up-to-date information. Thank you!
+ pulls:
+   # Comment to post when marking a pull request as stale.
+   markComment: >
+     This pull request has been automatically marked as stale.
+     **If this pull request is still relevant, please leave any comment** (for example, "bump"), and we'll keep it open.
+     We are sorry that we haven't been able to prioritize reviewing it yet. Your contribution is very much appreciated.
+   # Comment to post when closing a stale pull request.
+   closeComment: >
+     Closing this pull request after a prolonged period of inactivity. If this issue is still present in the latest release, please ask for this pull request to be reopened. Thank you!
+
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/workflows/build.yml ADDED
@@ -0,0 +1,60 @@
+ name: build
+
+ on:
+   # Trigger the workflow on push to main or any pull request
+   push:
+     branches:
+       - main
+   pull_request:
+
+ jobs:
+   build:
+
+     strategy:
+       max-parallel: 4
+       matrix:
+         platform: [ubuntu-latest, macos-latest]
+         python-version: [3.8, 3.9]
+
+     runs-on: ${{ matrix.platform }}
+
+     steps:
+       - uses: actions/checkout@v2
+
+       - name: Set up Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v2
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - name: Conditionally install pytorch
+         if: matrix.platform == 'windows-latest'
+         run: pip3 install torch -f https://download.pytorch.org/whl/torch_stable.html
+
+       - name: Install locally
+         run: |
+           python -m pip install --upgrade pip
+           git submodule update --init --recursive
+           python setup.py build_ext --inplace
+           python -m pip install --editable .
+
+       - name: Install optional test requirements
+         run: |
+           python -m pip install iopath transformers pyarrow
+           python -m pip install git+https://github.com/facebookresearch/fairscale.git@main
+
+       - name: Lint with flake8
+         run: |
+           pip install flake8
+           # stop the build if there are Python syntax errors or undefined names
+           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --extend-exclude fairseq/model_parallel/megatron
+           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --extend-exclude fairseq/model_parallel/megatron
+
+       - name: Run tests
+         run: |
+           python setup.py test
+
+       - name: Lint with black
+         run: |
+           pip install black
+           black --check . --extend-exclude 'examples|fairseq\/model_parallel\/megatron'
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.github/workflows/build_wheels.yml ADDED
@@ -0,0 +1,41 @@
+ name: build_wheels
+
+ on:
+   push:
+     branches:
+       - v[0-9]+.[0-9]+.[x0-9]+
+     tags:
+       - v*
+
+ jobs:
+   build_wheels:
+     name: Build wheels on ${{ matrix.os }}
+     runs-on: ${{ matrix.os }}
+     strategy:
+       matrix:
+         os: [ubuntu-latest, macos-latest]
+
+     steps:
+       - uses: actions/checkout@v2
+
+       - name: Install Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.7'
+
+       - name: Install cibuildwheel
+         run: |
+           python -m pip install cibuildwheel
+
+       - name: Build wheels for CPython
+         run: |
+           python -m cibuildwheel --output-dir dist
+         env:
+           CIBW_BUILD: "cp36-*64 cp37-*64 cp38-*64"
+           CIBW_MANYLINUX_X86_64_IMAGE: manylinux1
+           CIBW_BEFORE_BUILD: git submodule update --init --recursive && pip install .
+
+       - uses: actions/upload-artifact@v2
+         with:
+           name: wheels
+           path: ./dist/*.whl
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.gitignore ADDED
@@ -0,0 +1,136 @@
+ # JetBrains PyCharm IDE
+ .idea/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # macOS dir files
+ .DS_Store
+
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Checkpoints
+ checkpoints
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # dotenv
+ .env
+
+ # virtualenv
+ .venv
+ venv/
+ ENV/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+
+ # Generated files
+ /fairseq/temporal_convolution_tbc
+ /fairseq/modules/*_layer/*_forward.cu
+ /fairseq/modules/*_layer/*_backward.cu
+ /fairseq/version.py
+
+ # data
+ data-bin/
+
+ # reranking
+ /examples/reranking/rerank_data
+
+ # Cython-generated C++ source files
+ /fairseq/data/data_utils_fast.cpp
+ /fairseq/data/token_block_utils_fast.cpp
+
+ # VSCODE
+ .vscode/ftp-sync.json
+ .vscode/settings.json
+
+ # Experimental Folder
+ experimental/*
+
+ # Weights and Biases logs
+ wandb/
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.gitmodules ADDED
@@ -0,0 +1,4 @@
+ [submodule "fairseq/model_parallel/megatron"]
+   path = fairseq/model_parallel/megatron
+   url = https://github.com/ngoyal2707/Megatron-LM
+   branch = fairseq
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.isort.cfg ADDED
@@ -0,0 +1,2 @@
+ [settings]
+ known_third_party = _cffi_backend,agg_results,aml,bitarray,boto3,botocore,dump_hubert_feature,dynamicconv_cuda,editdistance,faiss,fasttext,feature_utils,ffmpeg,g2p_en,h5py,hydra,hypothesis,indicnlp,inflect,iopath,joblib,kaldi_io,kenlm,libfb,librosa,lightconv_cuda,matplotlib,misc,mmpt,mmpt_cli,model,nltk,npy_append_array,numpy,omegaconf,pandas,pathbuilder,preprocessing,progressbar,pythainlp,random_sequence_shuffler,regex,sacrebleu,sacremoses,scipy,sentencepiece,setuptools,six,sklearn,soundfile,sweep,sweep_wmt_en2de_transformer_big_common,tabulate,torch,torchaudio,tqdm,unidecode,utils,videoreader,wav2vec_cluster_faiss,wget,yaml
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/.pre-commit-config.yaml ADDED
@@ -0,0 +1,40 @@
+ exclude: 'build|stubs'
+
+ default_language_version:
+   python: python3
+
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.0.1
+     hooks:
+       - id: trailing-whitespace
+       - id: check-ast
+       - id: check-merge-conflict
+       - id: no-commit-to-branch
+         args: ['--branch=master']
+       - id: check-added-large-files
+         args: ['--maxkb=500']
+       - id: end-of-file-fixer
+
+   - repo: https://github.com/ambv/black
+     rev: 21.12b0
+     hooks:
+       - id: black
+         language_version: python3.8
+
+   - repo: https://gitlab.com/pycqa/flake8
+     rev: 3.9.2
+     hooks:
+       - id: flake8
+         args: [
+           # only error for syntax errors and undefined names
+           "--select=E9,F63,F7,F82",
+         ]
+
+   - repo: https://github.com/pycqa/isort
+     rev: 5.10.1
+     hooks:
+       - id: isort
+         exclude: README.md
+         additional_dependencies: [toml]
+         args: ["--profile", "black"]
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,77 @@
+ # Code of Conduct
+
+ ## Our Pledge
+
+ In the interest of fostering an open and welcoming environment, we as
+ contributors and maintainers pledge to make participation in our project and
+ our community a harassment-free experience for everyone, regardless of age, body
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
+ level of experience, education, socio-economic status, nationality, personal
+ appearance, race, religion, or sexual identity and orientation.
+
+ ## Our Standards
+
+ Examples of behavior that contributes to creating a positive environment
+ include:
+
+ * Using welcoming and inclusive language
+ * Being respectful of differing viewpoints and experiences
+ * Gracefully accepting constructive criticism
+ * Focusing on what is best for the community
+ * Showing empathy towards other community members
+
+ Examples of unacceptable behavior by participants include:
+
+ * The use of sexualized language or imagery and unwelcome sexual attention or
+   advances
+ * Trolling, insulting/derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or electronic
+   address, without explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+   professional setting
+
+ ## Our Responsibilities
+
+ Project maintainers are responsible for clarifying the standards of acceptable
+ behavior and are expected to take appropriate and fair corrective action in
+ response to any instances of unacceptable behavior.
+
+ Project maintainers have the right and responsibility to remove, edit, or
+ reject comments, commits, code, wiki edits, issues, and other contributions
+ that are not aligned to this Code of Conduct, or to ban temporarily or
+ permanently any contributor for other behaviors that they deem inappropriate,
+ threatening, offensive, or harmful.
+
+ ## Scope
+
+ This Code of Conduct applies within all project spaces, and it also applies when
+ an individual is representing the project or its community in public spaces.
+ Examples of representing a project or community include using an official
+ project e-mail address, posting via an official social media account, or acting
+ as an appointed representative at an online or offline event. Representation of
+ a project may be further defined and clarified by project maintainers.
+
+ ## Enforcement
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported by contacting the project team at <conduct@pytorch.org>. All
+ complaints will be reviewed and investigated and will result in a response that
+ is deemed necessary and appropriate to the circumstances. The project team is
+ obligated to maintain confidentiality with regard to the reporter of an incident.
+ Further details of specific enforcement policies may be posted separately.
+
+ Project maintainers who do not follow or enforce the Code of Conduct in good
+ faith may face temporary or permanent repercussions as determined by other
+ members of the project's leadership.
+
+ ## Attribution
+
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+ [homepage]: https://www.contributor-covenant.org
+
+ For answers to common questions about this code of conduct, see
+ https://www.contributor-covenant.org/faq
+
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/CONTRIBUTING.md ADDED
@@ -0,0 +1,82 @@
+ # Contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq)
+ We want to make contributing to this project as easy and transparent as
+ possible.
+
+ ## Pull Requests
+ We actively welcome your pull requests.
+
+ 1. Fork the repo and create your branch from `main`.
+ 2. If you've added code that should be tested, add tests.
+ 3. If you've changed APIs, update the documentation.
+ 4. Ensure the test suite passes.
+ 5. Make sure your code lints.
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+ ## Contributor License Agreement ("CLA")
+ In order to accept your pull request, we need you to submit a CLA. You only need
+ to do this once to work on any of Facebook's open source projects.
+
+ Complete your CLA here: <https://code.facebook.com/cla>
+
+ ## Issues
+ We use GitHub issues to track public bugs. Please ensure your description is
+ clear and has sufficient instructions to be able to reproduce the issue.
+
+ ## License
+ By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq),
+ you agree that your contributions will be licensed under the LICENSE file in
+ the root directory of this source tree.
+
+ ## Pre-commit hooks
+ In order to ensure your code lints, there are pre-commit hooks configured in the repository which you can install.
+ After installation, they will automatically run each time you commit.
+ An abbreviated guide is given below; for more information, refer to [the official pre-commit documentation](https://pre-commit.com/).
+
+ ### Installation
+ ```
+ pip install pre-commit
+ pre-commit install
+ ```
+
+ ### Usage
+ Just commit your changes:
+ ```
+ git commit -m "My informative commit message"
+ ```
+
+ If there was a failure, you will get feedback
+ ```
+ [INFO] Initializing environment for https://github.com/PyCQA/flake8.
+ [INFO] Installing environment for https://github.com/pre-commit/pre-commit-hooks.
+ [INFO] Once installed this environment will be reused.
+ [INFO] This may take a few minutes...
+ [INFO] Installing environment for https://github.com/PyCQA/flake8.
+ [INFO] Once installed this environment will be reused.
+ [INFO] This may take a few minutes...
+ Trim Trailing Whitespace.................................................Failed
+ - hook id: trailing-whitespace
+ - exit code: 1
+ - files were modified by this hook
+ Fixing examples/nllb/modeling/wmt15_benchmark/eval_langs2.sh
+ Fix End of Files.........................................................Failed
+ - hook id: end-of-file-fixer
+ - exit code: 1
+ - files were modified by this hook
+ Fixing examples/few_shot/scripts/schedule_jobs_few_shot.py
+ flake8...................................................................Passed
+ ```
+
+ Certain hooks modify your files to comply.
+ To include these modifications, you will need to add them (i.e. `git add ...`) and commit again.
+
+ If all is well, you should see something like:
+ ```
+ Trim Trailing Whitespace.................................................Passed
+ Fix End of Files.........................................................Passed
+ flake8...................................................................Passed
+ [gshard-fix-ci 8698644e1] Fix lint, add pre-commit hooks
+  10 files changed, 148 insertions(+), 110 deletions(-)
+  create mode 100644 .flake8
+  create mode 100644 .pre-commit-config.yaml
+  rename examples/nllb/modeling/wmt15_benchmark/{eval_langs2.py => eval_langs2.sh} (99%)
+ ```
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) Facebook, Inc. and its affiliates.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/README.md ADDED
@@ -0,0 +1,236 @@
+ <p align="center">
+   <img src="docs/fairseq_logo.png" width="150">
+   <br />
+   <br />
+   <a href="https://github.com/pytorch/fairseq/blob/main/LICENSE"><img alt="MIT License" src="https://img.shields.io/badge/license-MIT-blue.svg" /></a>
+   <a href="https://github.com/pytorch/fairseq/releases"><img alt="Latest Release" src="https://img.shields.io/github/release/pytorch/fairseq.svg" /></a>
+   <a href="https://github.com/pytorch/fairseq/actions?query=workflow:build"><img alt="Build Status" src="https://github.com/pytorch/fairseq/workflows/build/badge.svg" /></a>
+   <a href="https://fairseq.readthedocs.io/en/latest/?badge=latest"><img alt="Documentation Status" src="https://readthedocs.org/projects/fairseq/badge/?version=latest" /></a>
+ </p>
+
+ --------------------------------------------------------------------------------
+
+ Fairseq(-py) is a sequence modeling toolkit that allows researchers and
+ developers to train custom models for translation, summarization, language
+ modeling and other text generation tasks.
+
+ We provide reference implementations of various sequence modeling papers:
+
+ <details><summary>List of implemented papers</summary><p>
+
+ * **Convolutional Neural Networks (CNN)**
+   + [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/conv_lm/README.md)
+   + [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md)
+   + [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel)
+   + [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md)
+   + [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md)
+ * **LightConv and DynamicConv models**
+   + [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md)
+ * **Long Short-Term Memory (LSTM) networks**
+   + Effective Approaches to Attention-based Neural Machine Translation (Luong et al., 2015)
+ * **Transformer (self-attention) networks**
+   + Attention Is All You Need (Vaswani et al., 2017)
+   + [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md)
+   + [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md)
+   + [Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)](examples/language_model/README.adaptive_inputs.md)
+   + [Lexically constrained decoding with dynamic beam allocation (Post & Vilar, 2018)](examples/constrained_decoding/README.md)
+   + [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context (Dai et al., 2019)](examples/truncated_bptt/README.md)
+   + [Adaptive Attention Span in Transformers (Sukhbaatar et al., 2019)](examples/adaptive_span/README.md)
+   + [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md)
+   + [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md)
+   + [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md)
+   + [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md )
+   + [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et al., 2020)](examples/mbart/README.md)
+   + [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md)
+   + [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md)
+   + [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md)
+   + [Generating Medical Reports from Patient-Doctor Conversations Using Sequence-to-Sequence Models (Enarvi et al., 2020)](examples/pointer_generator/README.md)
+   + [Linformer: Self-Attention with Linear Complexity (Wang et al., 2020)](examples/linformer/README.md)
+   + [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md)
+   + [Deep Transformers with Latent Depth (Li et al., 2020)](examples/latent_depth/README.md)
+   + [Unsupervised Cross-lingual Representation Learning for Speech Recognition (Conneau et al., 2020)](https://arxiv.org/abs/2006.13979)
+   + [Self-training and Pre-training are Complementary for Speech Recognition (Xu et al., 2020)](https://arxiv.org/abs/2010.11430)
+   + [Robust wav2vec 2.0: Analyzing Domain Shift in Self-Supervised Pre-Training (Hsu, et al., 2021)](https://arxiv.org/abs/2104.01027)
+   + [Unsupervised Speech Recognition (Baevski, et al., 2021)](https://arxiv.org/abs/2105.11084)
+   + [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., 2021)](https://arxiv.org/abs/2109.11680)
+   + [VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding (Xu et al., 2021)](https://arxiv.org/pdf/2109.14084.pdf)
+   + [VLM: Task-agnostic Video-Language Model Pre-training for Video Understanding (Xu et al., 2021)](https://aclanthology.org/2021.findings-acl.370.pdf)
+   + [NormFormer: Improved Transformer Pretraining with Extra Normalization (Shleifer et al., 2021)](examples/normformer/README.md)
+ * **Non-autoregressive Transformers**
+   + Non-Autoregressive Neural Machine Translation (Gu et al., 2017)
+   + Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al. 2018)
+   + Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al. 2019)
+   + Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019)
+   + [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md)
+ * **Finetuning**
+   + [Better Fine-Tuning by Reducing Representational Collapse (Aghajanyan et al. 2020)](examples/rxf/README.md)
+
+ </p></details>
+
+ ### What's New:
+ * October 2021 [Released VideoCLIP and VLM models](examples/MMPT/README.md)
+ * October 2021 [Released multilingual finetuned XLSR-53 model](examples/wav2vec/README.md)
+ * September 2021 [`master` branch renamed to `main`](https://github.com/github/renaming).
+ * July 2021 [Released DrNMT code](examples/discriminative_reranking_nmt/README.md)
+ * July 2021 [Released Robust wav2vec 2.0 model](examples/wav2vec/README.md)
+ * June 2021 [Released XLMR-XL and XLMR-XXL models](examples/xlmr/README.md)
+ * May 2021 [Released Unsupervised Speech Recognition code](examples/wav2vec/unsupervised/README.md)
+ * March 2021 [Added full parameter and optimizer state sharding + CPU offloading](examples/fully_sharded_data_parallel/README.md)
+ * February 2021 [Added LASER training code](examples/laser/README.md)
+ * December 2020: [Added Adaptive Attention Span code](examples/adaptive_span/README.md)
+ * December 2020: [GottBERT model and code released](examples/gottbert/README.md)
+ * November 2020: Adopted the [Hydra](https://github.com/facebookresearch/hydra) configuration framework
+   * [see documentation explaining how to use it for new and existing projects](docs/hydra_integration.md)
+ * November 2020: [fairseq 0.10.0 released](https://github.com/pytorch/fairseq/releases/tag/v0.10.0)
+ * October 2020: [Added R3F/R4F (Better Fine-Tuning) code](examples/rxf/README.md)
+ * October 2020: [Deep Transformer with Latent Depth code released](examples/latent_depth/README.md)
+ * October 2020: [Added CRISS models and code](examples/criss/README.md)
+
+ <details><summary>Previous updates</summary><p>
+
+ * September 2020: [Added Linformer code](examples/linformer/README.md)
+ * September 2020: [Added pointer-generator networks](examples/pointer_generator/README.md)
+ * August 2020: [Added lexically constrained decoding](examples/constrained_decoding/README.md)
+ * August 2020: [wav2vec2 models and code released](examples/wav2vec/README.md)
+ * July 2020: [Unsupervised Quality Estimation code released](examples/unsupervised_quality_estimation/README.md)
+ * May 2020: [Follow fairseq on Twitter](https://twitter.com/fairseq)
+ * April 2020: [Monotonic Multihead Attention code released](examples/simultaneous_translation/README.md)
+ * April 2020: [Quant-Noise code released](examples/quant_noise/README.md)
+ * April 2020: [Initial model parallel support and 11B parameters unidirectional LM released](examples/megatron_11b/README.md)
+ * March 2020: [Byte-level BPE code released](examples/byte_level_bpe/README.md)
+ * February 2020: [mBART model and code released](examples/mbart/README.md)
+ * February 2020: [Added tutorial for back-translation](https://github.com/pytorch/fairseq/tree/main/examples/backtranslation#training-your-own-model-wmt18-english-german)
+ * December 2019: [fairseq 0.9.0 released](https://github.com/pytorch/fairseq/releases/tag/v0.9.0)
+ * November 2019: [VizSeq released (a visual analysis toolkit for evaluating fairseq models)](https://facebookresearch.github.io/vizseq/docs/getting_started/fairseq_example)
+ * November 2019: [CamemBERT model and code released](examples/camembert/README.md)
+ * November 2019: [BART model and code released](examples/bart/README.md)
+ * November 2019: [XLM-R models and code released](examples/xlmr/README.md)
+ * September 2019: [Nonautoregressive translation code released](examples/nonautoregressive_translation/README.md)
+ * August 2019: [WMT'19 models released](examples/wmt19/README.md)
+ * July 2019: fairseq relicensed under MIT license
111
+ * July 2019: [RoBERTa models and code released](examples/roberta/README.md)
112
+ * June 2019: [wav2vec models and code released](examples/wav2vec/README.md)
113
+
114
+ </p></details>
115
+
116
+ ### Features:
117
+
118
+ * multi-GPU training on one machine or across multiple machines (data and model parallel)
119
+ * fast generation on both CPU and GPU with multiple search algorithms implemented:
120
+ + beam search
121
+ + Diverse Beam Search ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424))
122
+ + sampling (unconstrained, top-k and top-p/nucleus)
123
+ + [lexically constrained decoding](examples/constrained_decoding/README.md) (Post & Vilar, 2018)
124
+ * [gradient accumulation](https://fairseq.readthedocs.io/en/latest/getting_started.html#large-mini-batch-training-with-delayed-updates) enables training with large mini-batches even on a single GPU
125
+ * [mixed precision training](https://fairseq.readthedocs.io/en/latest/getting_started.html#training-with-half-precision-floating-point-fp16) (trains faster with less GPU memory on [NVIDIA tensor cores](https://developer.nvidia.com/tensor-cores))
126
+ * [extensible](https://fairseq.readthedocs.io/en/latest/overview.html): easily register new models, criterions, tasks, optimizers and learning rate schedulers
127
+ * [flexible configuration](docs/hydra_integration.md) based on [Hydra](https://github.com/facebookresearch/hydra) allowing a combination of code, command-line and file based configuration
128
+ * [full parameter and optimizer state sharding](examples/fully_sharded_data_parallel/README.md)
129
+ * [offloading parameters to CPU](examples/fully_sharded_data_parallel/README.md)
130
+
131
+ We also provide [pre-trained models for translation and language modeling](#pre-trained-models-and-examples)
132
+ with a convenient `torch.hub` interface:
133
+
134
+ ``` python
135
+ en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model')
136
+ en2de.translate('Hello world', beam=5)
137
+ # 'Hallo Welt'
138
+ ```
139
+
140
+ See the PyTorch Hub tutorials for [translation](https://pytorch.org/hub/pytorch_fairseq_translation/)
141
+ and [RoBERTa](https://pytorch.org/hub/pytorch_fairseq_roberta/) for more examples.
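+
+ Loading RoBERTa through the same interface looks roughly like this (a minimal sketch; see the RoBERTa README and the hub tutorial above for the full API):
+
+ ``` python
+ import torch
+
+ roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
+ roberta.eval()  # disable dropout
+ tokens = roberta.encode('Hello world!')       # BPE-encode into a tensor of token ids
+ features = roberta.extract_features(tokens)   # shape: (1, num_tokens, hidden_dim)
+ ```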
142
+
143
+ # Requirements and Installation
144
+
145
+ * [PyTorch](http://pytorch.org/) version >= 1.5.0
146
+ * Python version >= 3.6
147
+ * For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl)
148
+ * **To install fairseq** and develop locally:
149
+
150
+ ``` bash
151
+ git clone https://github.com/pytorch/fairseq
152
+ cd fairseq
153
+ pip install --editable ./
154
+
155
+ # on MacOS:
156
+ # CFLAGS="-stdlib=libc++" pip install --editable ./
157
+
158
+ # to install the latest stable release (0.10.x)
159
+ # pip install fairseq
160
+ ```
161
+
162
+ * **For faster training** install NVIDIA's [apex](https://github.com/NVIDIA/apex) library:
163
+
164
+ ``` bash
165
+ git clone https://github.com/NVIDIA/apex
166
+ cd apex
167
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \
168
+ --global-option="--deprecated_fused_adam" --global-option="--xentropy" \
169
+ --global-option="--fast_multihead_attn" ./
170
+ ```
171
+
172
+ * **For large datasets** install [PyArrow](https://arrow.apache.org/docs/python/install.html#using-pip): `pip install pyarrow`
173
+ * If you use Docker make sure to increase the shared memory size either with `--ipc=host` or `--shm-size`
174
+ as command line options to `nvidia-docker run`.
175
+
176
+ # Getting Started
177
+
178
+ The [full documentation](https://fairseq.readthedocs.io/) contains instructions
179
+ for getting started, training new models and extending fairseq with new model
180
+ types and tasks.
181
+
182
+ # Pre-trained models and examples
183
+
184
+ We provide pre-trained models and pre-processed, binarized test sets for several tasks listed below,
185
+ as well as example training and evaluation commands.
186
+
187
+ * [Translation](examples/translation/README.md): convolutional and transformer models are available
188
+ * [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available
189
+
190
+ We also have more detailed READMEs to reproduce results from specific papers:
191
+
192
+ * [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale (Babu et al., 2021)](examples/wav2vec/xlsr/README.md)
193
+ * [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md)
194
+ * [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md)
195
+ * [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md)
196
+ * [Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)](examples/quant_noise/README.md)
197
+ * [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md)
198
+ * [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et al., 2020)](examples/mbart/README.md)
199
+ * [Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019)](examples/layerdrop/README.md)
200
+ * [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md)
201
+ * [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md)
202
+ * [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md)
203
+ * [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md)
204
+ * [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md)
205
+ * [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md)
206
+ * [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md)
207
+ * [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md)
208
+ * [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel)
209
+ * [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md)
210
+ * [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md)
211
+ * [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md)
212
+ * [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/README.conv.md)
213
+
214
+ # Join the fairseq community
215
+
216
+ * Twitter: https://twitter.com/fairseq
217
+ * Facebook page: https://www.facebook.com/groups/fairseq.users
218
+ * Google group: https://groups.google.com/forum/#!forum/fairseq-users
219
+
220
+ # License
221
+
222
+ fairseq(-py) is MIT-licensed.
223
+ The license applies to the pre-trained models as well.
224
+
225
+ # Citation
226
+
227
+ Please cite as:
228
+
229
+ ``` bibtex
230
+ @inproceedings{ott2019fairseq,
231
+ title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
232
+ author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli},
233
+ booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations},
234
+ year = {2019},
235
+ }
236
+ ```
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ !*/*.sh
2
+ !*/*.md
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/.gitignore ADDED
@@ -0,0 +1,139 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+ runs
131
+ data
132
+ pretrained_models
133
+ projects/mmfusion_*
134
+ log_test
135
+ third-party
136
+ python_log
137
+ slurm_snapshot_code
138
+ lightning_logs
139
+ demos
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/CONFIG.md ADDED
@@ -0,0 +1,41 @@
1
+ ### Config Files Explained
2
+
3
+ Taking `projects/mfmmlm.yaml` for example, which run pretraining using masked frame model (MFM) and masked language model (MLM) on a single BERT:
4
+
5
+ ```yaml
6
+ project_dir: mfmmlm # specify the project dir for this baseline.
7
+ run_task:
8
+ - how2.yaml # run pretraining on how2 when launching `projects/taskmfmmlm.yaml`
9
+ - [vtt.yaml, vttcap.yaml, vttqa.yaml, youcook.yaml, youcookcap.yaml, crosstask.yaml, coin.yaml] # run fine-tuning tasks.
10
+ base_dir: task # a global template folder to specify each training task.
11
+ task_group:
12
+ pretrain: # section for pretraining. Most baselines differs in this section.
13
+ task_list:
14
+ - how2.yaml # reconfig `projects/task/how2.yaml`
15
+ dataset:
16
+ aligner: MFMMLMAligner # overwrite the aligner for MFMMLM training task.
17
+ model:
18
+ model_cls: MMFusionMFMMLM # overwrite the model, which constructs negative examples for MFM on-the-fly.
19
+ loss:
20
+ loss_cls: MFMMLM # overwrite the loss as MFMMLM, which combines MFM and MLM together.
21
+ fairseq: # all fairseq args can be expecified under this name.
22
+ dataset:
23
+ batch_size: 128
24
+ finetune: # section for fine-tuning tasks, we don't need to change anything here mostly since we want to see how pretraining can contribute to finetuning.
25
+ task_list: # specify the list of downstream tasks, e.g., copy `projects/task/vtt.yaml` to `projects/mfmmlm`.
26
+ - vtt.yaml
27
+ - vttqa.yaml
28
+ - youcook.yaml
29
+ - youcookcap.yaml
30
+ - crosstask.yaml
31
+ - coin.yaml
32
+ test: # section for testing.
33
+ task_list:
34
+ - test_vtt.yaml
35
+ - test_vttqa.yaml
36
+ - test_youcook.yaml
37
+ - test_youcookcap.yaml
38
+ - test_crosstask.yaml
39
+ - test_crosstask_zs.yaml
40
+ - test_coin.yaml
41
+ ```
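+
+ At launch time, each `task_group` section above is merged on top of the corresponding base config under `base_dir` and saved under `projects/mfmmlm/`; this is what `locallaunch.py` does via `recursive_config` and `OmegaConf.merge`. A rough sketch of that merge for the pretraining section (using plain `OmegaConf.load` in place of the toolkit's `recursive_config`, which also resolves includes):
+
+ ```python
+ from omegaconf import OmegaConf
+
+ job = OmegaConf.load("projects/mfmmlm.yaml")
+ overwrite = job.task_group.pretrain
+ overwrite.pop("task_list", None)                  # task_list only selects which base yamls to copy
+ base = OmegaConf.load("projects/task/how2.yaml")  # template from base_dir
+ merged = OmegaConf.merge(base, overwrite)         # aligner/model/loss/fairseq args get overwritten
+ OmegaConf.save(config=merged, f="projects/mfmmlm/how2.yaml")
+ ```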
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/DATASET.md ADDED
@@ -0,0 +1,34 @@
1
+ # Dataset
2
+
3
+ We understand video data are challenging to download and process. For videos, we provide our preprocessing scripts under `scripts/video_feature_extractor` (deeply adapted from `https://github.com/antoine77340/video_feature_extractor`); for text, we provide pre-tokenization scripts under `scripts/text_token_extractor`.
4
+
5
+ ### S3D Feature Extraction
6
+ We use pre-trained [S3D](https://github.com/antoine77340/S3D_HowTo100M) for video feature extraction. Please place the models as `pretrained_models/s3d_dict.npy` and `pretrained_models/s3d_howto100m.pth`.
7
+
8
+ We implement a `PathBuilder` to automatically map video ids and source video paths to their feature locations (you may need `conda install -c anaconda pandas`). Decoding may need `pip install ffmpeg-python`.
9
+
10
+ ### Howto100M
11
+ [Howto100M](https://www.di.ens.fr/willow/research/howto100m/) is a large-scale video pre-training dataset. You may download the videos yourself and run our preprocessing scripts.
12
+
13
+ Our preprocessing differs from existing papers in several key ways: (1) we use `raw_caption.json` instead of `caption.json` to have pure self-supervision on text (`caption.json` has stop words manually removed); (2) we remove partially duplicated texts that were originally designed for real-time readability (see `mmpt/processors/dedupprocessor.py`); (3) we then shard video/text features using `ShardedTensor` in `mmpt/utils/shardedtensor.py` for fast loading during training (faster than `h5py`).
14
+
15
+ #### Steps
16
+ ##### video
17
+ To extract video features, edit and run `bash scripts/video_feature_extractor/how2/s3d.sh` (consider running this on multiple machines; by default, we store features in fp16 to save space and speed up training).
18
+
19
+ Split available video ids as `data/how2/how2_s3d_train.lst` and `data/how2/how2_s3d_val.lst`.
20
+
21
+ Lastly, pack video features into `ShardedTensor` using `python scripts/video_feature_extractor/shard_feature.py`.
22
+
23
+ ##### text
24
+ Clean captions using `python -m mmpt.processors.dedupprocessor`.
25
+
26
+ Tokenize dedupped captions `data/how2/raw_caption_dedup.pkl` into sharded numpy arrays:
27
+ ```
28
+ python scripts/text_token_extractor/pretokenization.py scripts/text_token_extractor/configs/bert-base-uncased.yaml
29
+ ```
30
+
31
+ ### Youcook, MSRVTT etc.
32
+ We use the versions of Youcook and MSRVTT that come with Howto100M and MILNCE. Please download the data to `data/youcook` and `data/msrvtt` accordingly; see `projects/task/youcook.yaml`, `projects/task/vtt.yaml`, etc. for details.
33
+ We extract features for Youcook and MSRVTT similarly to the first step for Howto100M, but we read text directly from the metadata and perform on-the-fly tokenization.
34
+
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/README.md ADDED
@@ -0,0 +1,166 @@
1
+ # VideoCLIP and VLM
2
+
3
+ You have just found the toolkit for multimodal video understanding! It contains implementations of two recent multi-modal video understanding papers, [VideoCLIP](https://arxiv.org/pdf/2109.14084.pdf) (EMNLP, 2021) and [VLM](https://aclanthology.org/2021.findings-acl.370.pdf) (ACL Findings, 2021), along with high-performance toolkits that are typically lacking in existing codebases. The toolkit is designed around generic, performance-tuned components that can potentially be adapted to other frameworks (we initially use fairseq).
4
+
5
+ VideoCLIP is a contrastive learning model for zero-shot transfer to retrieval/classification/sequence labeling style tasks.
6
+
7
+ <img src="videoclip.png" width="350" class="center">
8
+
9
+ VLM is a masked language model style pre-training using only one encoder with masked modality model (MMM) for retrieval/generation/sequence labeling style tasks.
10
+
11
+ <img src="vlm.png" width="350" class="center">
12
+
13
+ ### News
14
+ [Oct. 2021] Initial release of implementation for the following papers:
15
+ [VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding](https://arxiv.org/pdf/2109.14084.pdf) (Xu et al., EMNLP 2021)
16
+ [VLM: Task-agnostic Video-Language Model Pre-training for Video Understanding](https://aclanthology.org/2021.findings-acl.370.pdf) (Xu et al., ACL Findings 2021)
17
+
18
+
19
+ ### Installation
20
+ We aim to minimize the dependency of this repo on other packages.
21
+ We use fairseq as the main trainer (there is no models/datasets dependency on fairseq; we will support other trainers in the future):
22
+ ```
23
+ git clone https://github.com/pytorch/fairseq
24
+ cd fairseq
25
+ pip install -e . # also optionally follow fairseq README for apex installation for fp16 training.
26
+ export MKL_THREADING_LAYER=GNU # fairseq may need this for numpy.
27
+ ```
28
+
29
+ Then install this toolkit:
30
+ ```
31
+ cd examples/MMPT # MMPT can be in any folder, not necessarily under fairseq/examples.
32
+ pip install -e .
33
+ ```
34
+
35
+ The code was developed under Python=3.8.8, PyTorch=1.8, cuda=11.0 with fairseq=1.0.0a0+af0389f, and tested under Python=3.8.8, PyTorch=1.9, cuda=11.0, fairseq=1.0.0a0+8e7bc73 at the time of code release.
36
+ Most models require `transformers==3.4` for API compatibility (`pip install transformers==3.4`).
37
+ In addition, some downstream tasks may need `conda install pandas`.
38
+
39
+
40
+ ### Usage
41
+ #### Download Checkpoints
42
+ We use pre-trained [S3D](https://github.com/antoine77340/S3D_HowTo100M) for video feature extraction. Please place the models as `pretrained_models/s3d_dict.npy` and `pretrained_models/s3d_howto100m.pth`.
43
+
44
+ Download VideoCLIP checkpoint `https://dl.fbaipublicfiles.com/MMPT/retri/videoclip/checkpoint_best.pt` to `runs/retri/videoclip` or VLM checkpoint `https://dl.fbaipublicfiles.com/MMPT/mtm/vlm/checkpoint_best.pt` to `runs/mtm/vlm`.
45
+
46
+ #### Demo of Inference
47
+ Run `python locallaunch.py projects/retri/videoclip.yaml --dryrun` to generate all the `.yaml` configs for VideoCLIP.
48
+
49
+ ```python
50
+ import torch
51
+
52
+ from mmpt.models import MMPTModel
53
+
54
+
55
+ model, tokenizer, aligner = MMPTModel.from_pretrained(
56
+ "projects/retri/videoclip/how2.yaml")
57
+
58
+ model.eval()
59
+
60
+
61
+ # B, T, FPS, H, W, C (VideoCLIP is trained on 30 fps of s3d)
62
+ video_frames = torch.randn(1, 2, 30, 224, 224, 3)
63
+ caps, cmasks = aligner._build_text_seq(
64
+ tokenizer("some text", add_special_tokens=False)["input_ids"]
65
+ )
66
+
67
+ caps, cmasks = caps[None, :], cmasks[None, :] # bsz=1
68
+
69
+ with torch.no_grad():
70
+ output = model(video_frames, caps, cmasks, return_score=True)
71
+ print(output["score"]) # dot-product
72
+ ```
73
+
74
+ #### Data Preparation
75
+ See [dataset](DATASET.md) for each dataset.
76
+
77
+ #### Global Config for Training Pipeline
78
+ We organize a global config file for each training/testing pipeline under `projects` (see a detailed [explanation](CONFIG.md)). For example, VideoCLIP is in `projects/retri/videoclip.yaml` and VLM is in `projects/mtm/vlm.yaml`.
79
+
80
+ We wrap all commands in `locallaunch.py` and `mmpt_cli/localjob.py`. You can inspect the concrete commands with `--dryrun` and then drop the flag for an actual run.
81
+
82
+ First, running `python locallaunch.py projects/retri/videoclip.yaml --dryrun` generates the configs for pre-training, zero-shot evaluation, fine-tuning and testing of VideoCLIP under `projects/retri/videoclip`.
83
+
84
+ Each training or evaluation process is then configured by a concrete config file (we save all complex arguments, including fairseq args, into the concrete config file for reproducibility). For example, to run zero-shot evaluation on Youcook:
85
+ ```
86
+ python locallaunch.py projects/retri/videoclip/test_youcook_zs.yaml --jobtype local_predict # zero-shot evaluation.
87
+ python locallaunch.py projects/retri/videoclip/youcook_videoclip.yaml --jobtype local_single --dryrun # fine-tuning: use --dryrun to check cmds and drop it to make an actual run; local_small will run on two gpus (as in paper).
88
+ python locallaunch.py projects/retri/videoclip/test_youcook_videoclip.yaml --jobtype local_predict # testing on fine-tuned model.
89
+ ```
90
+
91
+ Pretraining can be run as:
92
+ ```
93
+ python locallaunch.py projects/retri/videoclip/how2.yaml --jobtype local_single --dryrun # check the cmds, then drop --dryrun; the paper was run on local_big with 8 gpus.
94
+ ```
95
+ You may need to change `--jobtype`; check/extend `LocalJob` in `mmpt_cli/localjob.py` for multi-GPU/multi-node pre-training.
96
+
97
+ Detailed instructions for pretraining and fine-tuning can be found in the [pretraining instructions](pretraining.md) and [finetuning instructions](endtask.md).
98
+
99
+
100
+ ### Development
101
+ Several components of this toolkit can be re-used for future research (and also our ongoing research).
102
+
103
+ #### Framework Wrapper
104
+ We currently only support fairseq, but most components can easily be adapted to other frameworks like HuggingFace. This repo is a fairseq `--user-dir` with fairseq wrappers. For example, `mmpt/tasks` includes a `FairseqMMTask`, which manages `mmpt/datasets` with `FairseqDataset`, `mmpt/models` with `FairseqModel`, and `mmpt/losses` with `FairseqCriterion`.
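+
+ As a small illustration of the wrapping (assuming `mmdataset` is an `mmpt.datasets.MMDataset` built from your processors), the fairseq trainer only ever sees a regular `FairseqDataset`:
+
+ ```python
+ from mmpt.datasets import FairseqMMDataset
+
+ fairseq_dataset = FairseqMMDataset(mmdataset)  # wrap an MMDataset for the fairseq trainer
+ fairseq_dataset.set_epoch(0)                   # the epoch is mixed into the per-example numpy seed
+ batch = fairseq_dataset.collater([fairseq_dataset[0], fairseq_dataset[1]])
+ ```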
105
+
106
+ #### Processors
107
+ **Multi**modal research introduces complexity in modality alignment, from the different input sources all the way to the losses. Inspired by [MMF](https://github.com/facebookresearch/mmf), this toolkit leverages `mmpt/processors` to handle the various needs of data preprocessing and loading, **alleviating** the need for multiple `torch.utils.data.Dataset` classes (which can be tricky for ablation studies).
108
+ Processors can also be decoupled from `torch.utils.data.Dataset` for offline preprocessing instead of on-the-fly data preprocessing.
109
+
110
+ We decompose `mmpt.MMDataset` into four types of processors: `MetaProcessor`, `VideoProcessor`, `TextProcessor` and `Aligner`. They can be configured in the `dataset` field of a config file (e.g., see `projects/task/how2.yaml`).
111
+ `MetaProcessor` loads the metadata of a dataset, e.g., all video_ids of the how2 dataset.
112
+ `VideoProcessor` loads the video features of a dataset, e.g., S3D features for each second of a video.
113
+ `TextProcessor` loads the text (features), e.g., BERT pre-tokenized text clips for the how2 dataset (with `start`/`end` timestamps and `cap` for `token_ids`).
114
+ `Aligner` is the core class, specific to each baseline, that prepares one training example, e.g., sampling a clip and masking tokens for MLM.
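+
+ A self-contained sketch of how these four processors compose into one `MMDataset` (the toy processors below are illustrative stand-ins, not the real how2 processors):
+
+ ```python
+ import torch
+ from mmpt.datasets import MMDataset
+
+ class ToyMetaProcessor:
+     split = "train"
+     def __init__(self):
+         self.ids = [("vid0", "vid0"), ("vid1", "vid1")]
+     def __getitem__(self, idx):
+         return self.ids[idx]  # (video_id, text_id)
+     def __len__(self):
+         return len(self.ids)
+
+ class ToyVideoProcessor:
+     def __call__(self, video_id):
+         return torch.randn(8, 512)  # e.g., 8 seconds of video features
+
+ class ToyTextProcessor:
+     def __call__(self, text_id):
+         return [101, 2023, 102]  # pre-tokenized token ids
+
+ class ToyAligner:
+     def __call__(self, video_id, vfeats, caps):
+         # build one training example from the video and text features
+         return {"video_id": video_id, "vfeats": vfeats, "caps": torch.tensor(caps)}
+
+ dataset = MMDataset(ToyMetaProcessor(), ToyVideoProcessor(), ToyTextProcessor(), ToyAligner())
+ print(dataset[0]["vfeats"].shape)  # one aligned training example
+ ```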
115
+
116
+ #### Performance-tuned Components
117
+ To speed up pre-training, this toolkit uses sharded features stored in mmaped numpy, backed by `ShardedTensor` in `mmpt/utils/shardedtensor.py` (adopted from the MARGE paper). This reduces the IO load for multi-GPU training by avoiding loading all features of a video into memory each time, and `ShardedTensor` ensures features are stored in contiguous disk space for near-random access. This is used for both How2 video features and texts in `mmpt/processors/how2processor.py`.
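+
+ The core access pattern can be illustrated with plain memory-mapped numpy (this is only an illustration of the idea, not the actual `ShardedTensor` API):
+
+ ```python
+ import numpy as np
+
+ # write one shard: features of many clips stored contiguously in fp16
+ feats = np.random.rand(1000, 512).astype("float16")
+ np.save("shard0.npy", feats)
+
+ # read with mmap: only the rows that are actually touched are read from disk
+ shard = np.load("shard0.npy", mmap_mode="r")
+ clip = np.array(shard[42])  # copy out a single clip's features
+ ```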
118
+
119
+
120
+ ### Citation
121
+ If this codebase is useful for your work, please cite the following papers:
122
+
123
+ ```BibTeX
124
+ @inproceedings{xu-etal-2021-videoclip,
125
+ title = "{VideoCLIP}: Contrastive Pre-training for\\Zero-shot Video-Text Understanding",
126
+ author = "Xu, Hu and
127
+ Ghosh, Gargi and
128
+ Huang, Po-Yao and
129
+ Okhonko, Dmytro and
130
+ Aghajanyan, Armen and
131
+ Metze, Florian and
132
+ Zettlemoyer, Luke and
133
+ Feichtenhofer, Christoph",
134
+ booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
135
+ month = nov,
136
+ year = "2021",
137
+ address = "Online",
138
+ publisher = "Association for Computational Linguistics",
139
+ }
140
+
141
+ @inproceedings{xu-etal-2021-vlm,
142
+ title = "{VLM}: Task-agnostic Video-Language Model Pre-training for Video Understanding",
143
+ author = "Xu, Hu and
144
+ Ghosh, Gargi and
145
+ Huang, Po-Yao and
146
+ Arora, Prahal and
147
+ Aminzadeh, Masoumeh and
148
+ Feichtenhofer, Christoph and
149
+ Metze, Florian and
150
+ Zettlemoyer, Luke",
151
+ booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021",
152
+ month = aug,
153
+ year = "2021",
154
+ address = "Online",
155
+ publisher = "Association for Computational Linguistics",
156
+ url = "https://aclanthology.org/2021.findings-acl.370",
157
+ doi = "10.18653/v1/2021.findings-acl.370",
158
+ pages = "4227--4239",
159
+ }
160
+ ```
161
+
162
+ ### Bug Reports
163
+ This repo is in its initial stage; bug reports are welcome at huxu@fb.com
164
+
165
+ ### Copyright
166
+ The majority of Multimodal Pre-training (MMPT) is licensed under CC-BY-NC; however, portions of the project are available under separate license terms: Evaluation Codes/Models: Howto100M and HuggingFace Transformers are licensed under the Apache 2.0 license; COIN and NLG-eval are licensed under the MIT license; CrossTask is licensed under the BSD-3 license; DiDeMo is licensed under the BSD-2 license.
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/endtask.md ADDED
@@ -0,0 +1,41 @@
1
+ # Zero-shot Transfer and Finetuning
2
+
3
+ (If you are new to the ideas of `mmpt.processors`, see [README](README.md) first.)
4
+ All finetuning datasets (specifically `processors`) are defined in `mmpt.processors.dsprocessor`.
5
+ Given the complexity of different types of finetuning tasks, each task may have its own meta/video/text/aligner processors and `mmpt/evaluators/{Predictor,Metric}`.
6
+
7
+ ### Tasks
8
+
9
+ Currently, we support 5 end datasets: `MSRVTT`, `Youcook`, `COIN`, `Crosstask` and `DiDeMo` with the following tasks:
10
+ text-video retrieval: `MSRVTT`, `Youcook`, `DiDeMo`;
11
+ video captioning: `Youcook`;
12
+ video question answering: `MSRVTT-QA`.
13
+
14
+ To add your own dataset, you can specify the corresponding processors and configure them in the `dataset` field of a config file, such as `projects/task/vtt.yaml`.
15
+
16
+ ### Zero-shot Transfer (no Training)
17
+ Zero-shot transfer runs the pre-trained model (e.g., VideoCLIP) directly on testing data. Configs matching the pattern `projects/task/*_zs_*.yaml` are dedicated to zero-shot transfer.
18
+
19
+ ### Fine-tuning
20
+
21
+ Training a downstream task is similar to pretraining, except you may need to specify `restore_file` in `fairseq.checkpoint` and reset optimizers; see `projects/task/ft.yaml`, which is included by `projects/task/vtt.yaml`.
22
+
23
+ We typically do finetuning on 2 gpus (`local_small`).
24
+
25
+ ### Testing
26
+ For each finetuning dataset, you may need to specify a testing config, similar to `projects/task/test_vtt.yaml`.
27
+
28
+ We define `mmpt.evaluators.Predictor` for different types of prediction. For example, `MSRVTT` and `Youcook` are video-retrieval tasks and are expected to use `RetrievalPredictor`. You may need to define a new type of predictor and specify it in the `predictor` field of a testing config.
29
+
30
+ Each task may also have its own metric for evaluation. This can be created under `mmpt.evaluators` (subclassing `Metric`) and specified in the `metric` field of a testing config, for example:
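+
+ A minimal sketch of such a metric (modeled on `QAMetric` in `mmpt/evaluators/metric.py`; the class name and the accuracy computation are illustrative):
+
+ ```python
+ import numpy as np
+ from mmpt.evaluators.metric import Metric
+
+ class MyTaskMetric(Metric):
+     def __init__(self, config, metric_names=["acc"]):
+         super().__init__(config, metric_names)
+
+     def compute_metrics(self, outputs, targets, **kwargs):
+         return {"acc": float(np.mean(np.array(outputs) == np.array(targets)))}
+
+     def print_computed_metrics(self, metrics):
+         print("acc: {:.4f}".format(metrics["acc"]))
+ ```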
31
+
32
+ Launching testing is as simple as training: just specify the path of a testing config:
33
+ ```python locallaunch.py projects/mfmmlm/test_vtt.yaml```
34
+ Testing will be launched locally by default since prediction is computationally less expensive.
35
+
36
+ ### Third-party Libraries
37
+ We list the following finetuning tasks that require third-party libraries.
38
+
39
+ Youcook captioning: `https://github.com/Maluuba/nlg-eval`
40
+
41
+ CrossTask: `https://github.com/DmZhukov/CrossTask`'s `dp` under `third-party/CrossTask` (`python setup.py build_ext --inplace`)
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/locallaunch.py ADDED
@@ -0,0 +1,148 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import argparse
6
+ import os
7
+
8
+ from omegaconf import OmegaConf
9
+
10
+ from mmpt.utils import recursive_config, overwrite_dir
11
+ from mmpt_cli.localjob import LocalJob
12
+
13
+
14
+ class JobLauncher(object):
15
+ JOB_CONFIG = {
16
+ "local": LocalJob,
17
+ }
18
+
19
+ def __init__(self, yaml_file):
20
+ self.yaml_file = yaml_file
21
+ job_key = "local"
22
+
23
+ if yaml_file.endswith(".yaml"):
24
+ config = recursive_config(yaml_file)
25
+ if config.task_type is not None:
26
+ job_key = config.task_type.split("_")[0]
27
+ else:
28
+ raise ValueError("unknown extension of job file:", yaml_file)
29
+ self.job_key = job_key
30
+
31
+ def __call__(self, job_type=None, dryrun=False):
32
+ if job_type is not None:
33
+ self.job_key = job_type.split("_")[0]
34
+ print("[JobLauncher] job_key", self.job_key)
35
+ job = JobLauncher.JOB_CONFIG[self.job_key](
36
+ self.yaml_file, job_type=job_type, dryrun=dryrun)
37
+ return job.submit()
38
+
39
+
40
+ class Pipeline(object):
41
+ """a job that loads yaml config."""
42
+
43
+ def __init__(self, fn):
44
+ """
45
+ load a yaml config of a job and save generated configs as yaml for each task.
46
+ return: a list of files to run as specified by `run_task`.
47
+ """
48
+ if fn.endswith(".py"):
49
+ # a python command.
50
+ self.backend = "python"
51
+ self.run_yamls = [fn]
52
+ return
53
+
54
+ job_config = recursive_config(fn)
55
+ if job_config.base_dir is None: # single file job config.
56
+ self.run_yamls = [fn]
57
+ return
58
+
59
+ self.project_dir = os.path.join("projects", job_config.project_dir)
60
+ self.run_dir = os.path.join("runs", job_config.project_dir)
61
+
62
+ if job_config.run_task is not None:
63
+ run_yamls = []
64
+ for stage in job_config.run_task:
65
+ # each stage can have multiple tasks running in parallel.
66
+ if OmegaConf.is_list(stage):
67
+ stage_yamls = []
68
+ for task_file in stage:
69
+ stage_yamls.append(
70
+ os.path.join(self.project_dir, task_file))
71
+ run_yamls.append(stage_yamls)
72
+ else:
73
+ run_yamls.append(os.path.join(self.project_dir, stage))
74
+ self.run_yamls = run_yamls
75
+ configs_to_save = self._overwrite_task(job_config)
76
+ self._save_configs(configs_to_save)
77
+
78
+ def __getitem__(self, idx):
79
+ yaml_files = self.run_yamls[idx]
80
+ if isinstance(yaml_files, list):
81
+ return [JobLauncher(yaml_file) for yaml_file in yaml_files]
82
+ return [JobLauncher(yaml_files)]
83
+
84
+ def __len__(self):
85
+ return len(self.run_yamls)
86
+
87
+ def _save_configs(self, configs_to_save: dict):
88
+ # save
89
+ os.makedirs(self.project_dir, exist_ok=True)
90
+ for config_file in configs_to_save:
91
+ config = configs_to_save[config_file]
92
+ print("saving", config_file)
93
+ OmegaConf.save(config=config, f=config_file)
94
+
95
+ def _overwrite_task(self, job_config):
96
+ configs_to_save = {}
97
+ self.base_project_dir = os.path.join("projects", job_config.base_dir)
98
+ self.base_run_dir = os.path.join("runs", job_config.base_dir)
99
+
100
+ for config_sets in job_config.task_group:
101
+ overwrite_config = job_config.task_group[config_sets]
102
+ if (
103
+ overwrite_config.task_list is None
104
+ or len(overwrite_config.task_list) == 0
105
+ ):
106
+ print(
107
+ "[warning]",
108
+ job_config.task_group,
109
+ "has no task_list specified.")
110
+ # we don't want this added to a final config.
111
+ task_list = overwrite_config.pop("task_list", None)
112
+ for config_file in task_list:
113
+ config_file_path = os.path.join(
114
+ self.base_project_dir, config_file)
115
+ config = recursive_config(config_file_path)
116
+ # overwrite it.
117
+ if overwrite_config:
118
+ config = OmegaConf.merge(config, overwrite_config)
119
+ overwrite_dir(config, self.run_dir, basedir=self.base_run_dir)
120
+ save_file_path = os.path.join(self.project_dir, config_file)
121
+ configs_to_save[save_file_path] = config
122
+ return configs_to_save
123
+
124
+
125
+ def main(args):
126
+ job_type = args.jobtype if args.jobtype else None
127
+ # parse multiple pipelines.
128
+ pipelines = [Pipeline(fn) for fn in args.yamls.split(",")]
129
+
130
+ for pipe_id, pipeline in enumerate(pipelines):
131
+ if not hasattr(pipeline, "project_dir"):
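+ # only single-file or .py pipelines are launched here; a project config (with
+ # `project_dir`) has already had its per-task yamls generated and saved by
+ # Pipeline.__init__, to be launched individually afterwards.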
132
+ for job in pipeline[0]:
133
+ job(job_type=job_type, dryrun=args.dryrun)
134
+
135
+
136
+ if __name__ == "__main__":
137
+ parser = argparse.ArgumentParser()
138
+ parser.add_argument("yamls", type=str)
139
+ parser.add_argument(
140
+ "--dryrun",
141
+ action="store_true",
142
+ help="run config and prepare to submit without launch the job.",
143
+ )
144
+ parser.add_argument(
145
+ "--jobtype", type=str, default="",
146
+ help="force to run jobs as specified.")
147
+ args = parser.parse_args()
148
+ main(args)
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ try:
6
+ # fairseq user dir
7
+ from .datasets import FairseqMMDataset
8
+ from .losses import FairseqCriterion
9
+ from .models import FairseqMMModel
10
+ from .tasks import FairseqMMTask
11
+ except ImportError:
12
+ pass
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/datasets/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ from .mmdataset import *
6
+
7
+ try:
8
+ from .fairseqmmdataset import *
9
+ except ImportError:
10
+ pass
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/datasets/fairseqmmdataset.py ADDED
@@ -0,0 +1,57 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ """
6
+ TODO (huxu): fairseq wrapper class for all dataset you defined: mostly MMDataset.
7
+ """
8
+
9
+ from collections import OrderedDict
10
+
11
+ from torch.utils.data import Dataset
12
+ from torch.utils.data.dataloader import default_collate
13
+ from fairseq.data import FairseqDataset, data_utils
14
+
15
+
16
+ class FairseqMMDataset(FairseqDataset):
17
+ """
18
+ A wrapper class for MMDataset for fairseq.
19
+ """
20
+
21
+ def __init__(self, mmdataset):
22
+ if not isinstance(mmdataset, Dataset):
23
+ raise TypeError("mmdataset must be of type `torch.utils.data.Dataset`.")
24
+ self.mmdataset = mmdataset
25
+
26
+ def set_epoch(self, epoch, **unused):
27
+ super().set_epoch(epoch)
28
+ self.epoch = epoch
29
+
30
+ def __getitem__(self, idx):
31
+ with data_utils.numpy_seed(43211, self.epoch, idx):
32
+ return self.mmdataset[idx]
33
+
34
+ def __len__(self):
35
+ return len(self.mmdataset)
36
+
37
+ def collater(self, samples):
38
+ if hasattr(self.mmdataset, "collator"):
39
+ return self.mmdataset.collator(samples)
40
+ if len(samples) == 0:
41
+ return {}
42
+ if isinstance(samples[0], dict):
43
+ batch = OrderedDict()
44
+ for key in samples[0]:
45
+ if samples[0][key] is not None:
46
+ batch[key] = default_collate([sample[key] for sample in samples])
47
+ return batch
48
+ else:
49
+ return default_collate(samples)
50
+
51
+ def size(self, index):
52
+ """dummy implementation: we don't use --max-tokens"""
53
+ return 1
54
+
55
+ def num_tokens(self, index):
56
+ """dummy implementation: we don't use --max-tokens"""
57
+ return 1
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/datasets/mmdataset.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ from collections import OrderedDict
9
+
10
+ from torch.utils.data import Dataset
11
+ from torch.utils.data.dataloader import default_collate
12
+
13
+ from ..utils import set_seed
14
+
15
+
16
+ class MMDataset(Dataset):
17
+ """
18
+ A generic multi-modal dataset.
19
+ Args:
20
+ `meta_processor`: a meta processor,
21
+ handling loading meta data and return video_id and text_id.
22
+ `video_processor`: a video processor,
23
+ handling e.g., decoding, loading .np files.
24
+ `text_processor`: a text processor,
25
+ handling e.g., tokenization.
26
+ `aligner`: combine the video and text feature
27
+ as one training example.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ meta_processor,
33
+ video_processor,
34
+ text_processor,
35
+ align_processor,
36
+ ):
37
+ self.split = meta_processor.split
38
+ self.meta_processor = meta_processor
39
+ self.video_processor = video_processor
40
+ self.text_processor = text_processor
41
+ self.align_processor = align_processor
42
+
43
+ def __len__(self):
44
+ return len(self.meta_processor)
45
+
46
+ def __getitem__(self, idx):
47
+ if self.split == "test":
48
+ set_seed(idx)
49
+ video_id, text_id = self.meta_processor[idx]
50
+ video_feature = self.video_processor(video_id)
51
+ text_feature = self.text_processor(text_id)
52
+ output = self.align_processor(video_id, video_feature, text_feature)
53
+ # TODO (huxu): the following is for debug purpose.
54
+ output.update({"idx": idx})
55
+ return output
56
+
57
+ def collater(self, samples):
58
+ """This collator is deprecated.
59
+ set self.collator = MMDataset.collater.
60
+ see collator in FairseqMMDataset.
61
+ """
62
+
63
+ if len(samples) == 0:
64
+ return {}
65
+ if isinstance(samples[0], dict):
66
+ batch = OrderedDict()
67
+ for key in samples[0]:
68
+ if samples[0][key] is not None:
69
+ batch[key] = default_collate(
70
+ [sample[key] for sample in samples])
71
+ # if torch.is_tensor(batch[key]):
72
+ # print(key, batch[key].size())
73
+ # else:
74
+ # print(key, len(batch[key]))
75
+ return batch
76
+ else:
77
+ return default_collate(samples)
78
+
79
+ def print_example(self, output):
80
+ print("[one example]", output["video_id"])
81
+ if (
82
+ hasattr(self.align_processor, "subsampling")
83
+ and self.align_processor.subsampling is not None
84
+ and self.align_processor.subsampling > 1
85
+ ):
86
+ for key in output:
87
+ if torch.is_tensor(output[key]):
88
+ output[key] = output[key][0]
89
+
90
+ # search tokenizer to translate ids back.
91
+ tokenizer = None
92
+ if hasattr(self.text_processor, "tokenizer"):
93
+ tokenizer = self.text_processor.tokenizer
94
+ elif hasattr(self.align_processor, "tokenizer"):
95
+ tokenizer = self.align_processor.tokenizer
96
+ if tokenizer is not None:
97
+ caps = output["caps"].tolist()
98
+ if isinstance(caps[0], list):
99
+ caps = caps[0]
100
+ print("caps", tokenizer.decode(caps))
101
+ print("caps", tokenizer.convert_ids_to_tokens(caps))
102
+
103
+ for key, value in output.items():
104
+ if torch.is_tensor(value):
105
+ if len(value.size()) >= 3: # attention_mask.
106
+ print(key, value.size())
107
+ print(key, "first", value[0, :, :])
108
+ print(key, "last", value[-1, :, :])
109
+ else:
110
+ print(key, value)
111
+ print("[end of one example]")
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ from .metric import *
6
+ from .evaluator import *
7
+
8
+
9
+ # experimental.
10
+ try:
11
+ from .expmetric import *
12
+ except ImportError:
13
+ pass
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/evaluator.py ADDED
@@ -0,0 +1,54 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import os
6
+ import glob
7
+ import numpy as np
8
+
9
+ from . import metric as metric_path
10
+ from . import predictor as predictor_path
11
+
12
+
13
+ class Evaluator(object):
14
+ """
15
+ perform evaluation on a single (downstream) task.
16
+ make this both offline and online.
17
+ TODO(huxu) saving evaluation results.
18
+ """
19
+
20
+ def __init__(self, config, eval_dataloader=None):
21
+ if config.metric is None:
22
+ raise ValueError("config.metric is", config.metric)
23
+ metric_cls = getattr(metric_path, config.metric)
24
+ self.metric = metric_cls(config)
25
+ if config.predictor is None:
26
+ raise ValueError("config.predictor is", config.predictor)
27
+ predictor_cls = getattr(predictor_path, config.predictor)
28
+ self.predictor = predictor_cls(config)
29
+ self.eval_dataloader = eval_dataloader
30
+
31
+ def __call__(self):
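+ # offline mode: load predictions previously saved by the predictor under
+ # pred_dir and compute metrics from them.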
32
+ try:
33
+ print(self.predictor.pred_dir)
34
+ for pred_file in glob.glob(
35
+ self.predictor.pred_dir + "/*_merged.npy"):
36
+ outputs = np.load(pred_file)
37
+ results = self.metric.compute_metrics(outputs)
38
+ self.metric.print_computed_metrics(results)
39
+
40
+ outputs = np.load(os.path.join(
41
+ self.predictor.pred_dir, "merged.npy"))
42
+ results = self.metric.compute_metrics(outputs)
43
+ return {"results": results, "metric": self.metric}
44
+ except FileNotFoundError:
45
+ print("\n[missing]", self.predictor.pred_dir)
46
+ return {}
47
+
48
+ def evaluate(self, model, eval_dataloader=None, output_file="merged"):
49
+ if eval_dataloader is None:
50
+ eval_dataloader = self.eval_dataloader
51
+ outputs = self.predictor.predict_loop(
52
+ model, eval_dataloader, output_file)
53
+ results = self.metric.compute_metrics(**outputs)
54
+ return results
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/metric.py ADDED
@@ -0,0 +1,313 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import json
8
+
9
+
10
+ class Metric(object):
11
+ def __init__(self, config, metric_names):
12
+ self.metric_names = metric_names
13
+
14
+ def best_metric(self, metric):
15
+ return metric[self.metric_names[0]]
16
+
17
+ def save_metrics(self, fn, metrics):
18
+ with open(fn, "w") as fw:
19
+ json.dump(metrics, fw)
20
+
21
+ def print_computed_metrics(self, metrics):
22
+ raise NotImplementedError
23
+
24
+
25
+ class RetrievalMetric(Metric):
26
+ """
27
+ this is modified from `howto100m/metrics.py`.
28
+ History of changes:
29
+ refactor as a class.
30
+ add metric_key in __init__
31
+ """
32
+
33
+ def __init__(self, config, metric_names=["R1", "R5", "R10", "MR"]):
34
+ super().__init__(config, metric_names)
35
+ self.error = False # TODO(huxu): add to config to print error.
36
+
37
+ def compute_metrics(self, outputs, texts, **kwargs):
38
+ x = outputs
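+ # x[i, j] is the similarity between text i and video j; the position of the
+ # ground-truth (diagonal) entry within each row sorted by similarity gives the rank.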
39
+ sx = np.sort(-x, axis=1)
40
+ d = np.diag(-x)
41
+ d = d[:, np.newaxis]
42
+ ind = sx - d
43
+ ind = np.where(ind == 0)
44
+ ind = ind[1]
45
+ metrics = {}
46
+ metrics["R1"] = float(np.sum(ind == 0)) / len(ind)
47
+ metrics["R5"] = float(np.sum(ind < 5)) / len(ind)
48
+ metrics["R10"] = float(np.sum(ind < 10)) / len(ind)
49
+ metrics["MR"] = np.median(ind) + 1
50
+
51
+ max_idx = np.argmax(outputs, axis=1)
52
+ if self.error:
53
+ # print top-20 errors.
54
+ error = []
55
+ for ex_idx in range(20):
56
+ error.append((texts[ex_idx], texts[max_idx[ex_idx]]))
57
+ metrics["error"] = error
58
+ return metrics
59
+
60
+ def print_computed_metrics(self, metrics):
61
+ r1 = metrics["R1"]
62
+ r5 = metrics["R5"]
63
+ r10 = metrics["R10"]
64
+ mr = metrics["MR"]
65
+ print(
66
+ "R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}".format(
67
+ r1, r5, r10, mr
68
+ )
69
+ )
70
+ if "error" in metrics:
71
+ print(metrics["error"])
72
+
73
+
74
+ class DiDeMoMetric(Metric):
75
+ """
76
+ History of changes:
77
+ python 2.x to python 3.x.
78
+ merge utils.py into eval to save one file.
79
+ reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py
80
+ Code to evaluate your results on the DiDeMo dataset.
81
+ """
82
+ def __init__(self, config, metric_names=["rank1", "rank5", "miou"]):
83
+ super().__init__(config, metric_names)
84
+
85
+ def compute_metrics(self, outputs, targets, **kwargs):
86
+ assert len(outputs) == len(targets)
87
+ rank1, rank5, miou = self._eval_predictions(outputs, targets)
88
+ metrics = {
89
+ "rank1": rank1,
90
+ "rank5": rank5,
91
+ "miou": miou
92
+ }
93
+ return metrics
94
+
95
+ def print_computed_metrics(self, metrics):
96
+ rank1 = metrics["rank1"]
97
+ rank5 = metrics["rank5"]
98
+ miou = metrics["miou"]
99
+ # print("Average rank@1: %f" % rank1)
100
+ # print("Average rank@5: %f" % rank5)
101
+ # print("Average iou: %f" % miou)
102
+
103
+ print(
104
+ "Average rank@1: {:.4f} Average rank@5: {:.4f} Average iou: {:.4f}".format(
105
+ rank1, rank5, miou
106
+ )
107
+ )
108
+
109
+ def _iou(self, pred, gt):
110
+ intersection = max(0, min(pred[1], gt[1]) + 1 - max(pred[0], gt[0]))
111
+ union = max(pred[1], gt[1]) + 1 - min(pred[0], gt[0])
112
+ return float(intersection)/union
113
+
114
+ def _rank(self, pred, gt):
115
+ return pred.index(tuple(gt)) + 1
116
+
117
+ def _eval_predictions(self, segments, data):
118
+ '''
119
+ Inputs:
120
+ segments: For each item in the ground truth data, rank possible video segments given the description and video.
121
+ In DiDeMo, there are 21 possible moments extracted for each video, so the list of video segments will be of length 21.
122
+ The first video segment should be the video segment that best corresponds to the text query.
123
+ There are 4180 sentences in the validation data, so when evaluating a model on the val dataset,
124
+ segments should be a list of length 4180, and each item in segments should be a list of length 21.
125
+ data: ground truth data
126
+ '''
127
+ average_ranks = []
128
+ average_iou = []
129
+ for s, d in zip(segments, data):
130
+ pred = s[0]
131
+ ious = [self._iou(pred, t) for t in d['times']]
132
+ average_iou.append(np.mean(np.sort(ious)[-3:]))
133
+ ranks = [self._rank(s, t) for t in d['times'] if tuple(t) in s] # if t in s] is added for s, e not in prediction.
134
+ average_ranks.append(np.mean(np.sort(ranks)[:3]))
135
+ rank1 = np.sum(np.array(average_ranks) <= 1)/float(len(average_ranks))
136
+ rank5 = np.sum(np.array(average_ranks) <= 5)/float(len(average_ranks))
137
+ miou = np.mean(average_iou)
138
+
139
+ # print("Average rank@1: %f" % rank1)
140
+ # print("Average rank@5: %f" % rank5)
141
+ # print("Average iou: %f" % miou)
142
+ return rank1, rank5, miou
143
+
144
+
145
+ class NLGMetric(Metric):
146
+ def __init__(
147
+ self,
148
+ config,
149
+ metric_names=[
150
+ "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4",
151
+ "METEOR", "ROUGE_L", "CIDEr"
152
+ ]
153
+ ):
154
+ super().__init__(config, metric_names)
155
+ # please install NLGEval from `https://github.com/Maluuba/nlg-eval`
156
+ from nlgeval import NLGEval
157
+ self.nlg = NLGEval()
158
+
159
+ def compute_metrics(self, outputs, targets, **kwargs):
160
+ return self.nlg.compute_metrics(
161
+ hyp_list=outputs, ref_list=targets)
162
+
163
+ def print_computed_metrics(self, metrics):
164
+ Bleu_1 = metrics["Bleu_1"]
165
+ Bleu_2 = metrics["Bleu_2"]
166
+ Bleu_3 = metrics["Bleu_3"]
167
+ Bleu_4 = metrics["Bleu_4"]
168
+ METEOR = metrics["METEOR"]
169
+ ROUGE_L = metrics["ROUGE_L"]
170
+ CIDEr = metrics["CIDEr"]
171
+
172
+ print(
173
+ "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f} - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}".format(
174
+ Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr
175
+ )
176
+ )
177
+
178
+
179
+ class QAMetric(Metric):
180
+ def __init__(
181
+ self,
182
+ config,
183
+ metric_names=["acc"]
184
+ ):
185
+ super().__init__(config, metric_names)
186
+
187
+ def compute_metrics(self, outputs, targets, **kwargs):
188
+ from sklearn.metrics import accuracy_score
189
+ return {"acc": accuracy_score(targets, outputs)}
190
+
191
+ def print_computed_metrics(self, metrics):
192
+ print("acc: {:.4f}".format(metrics["acc"]))
193
+
194
+
195
+ class COINActionSegmentationMetric(Metric):
196
+ """
197
+ COIN dataset listed 3 repos for Action Segmentation.
198
+ Action Sets, NeuralNetwork-Viterbi, TCFPN-ISBA.
199
+ The first and second are the same.
200
+ https://github.com/alexanderrichard/action-sets/blob/master/eval.py
201
+
202
+ Future reference for the third:
203
+ `https://github.com/Zephyr-D/TCFPN-ISBA/blob/master/utils/metrics.py`
204
+ """
205
+ def __init__(self, config, metric_name=["frame_acc"]):
206
+ super().__init__(config, metric_name)
207
+
208
+ def compute_metrics(self, outputs, targets):
209
+ n_frames = 0
210
+ n_errors = 0
211
+ n_errors = sum(outputs != targets)
212
+ n_frames = len(targets)
213
+ return {"frame_acc": 1.0 - float(n_errors) / n_frames}
214
+
215
+ def print_computed_metrics(self, metrics):
216
+ fa = metrics["frame_acc"]
217
+ print("frame accuracy:", fa)
218
+
219
+
220
+ class CrossTaskMetric(Metric):
221
+ def __init__(self, config, metric_names=["recall"]):
222
+ super().__init__(config, metric_names)
223
+
224
+ def compute_metrics(self, outputs, targets, **kwargs):
225
+ """refactored from line 166:
226
+ https://github.com/DmZhukov/CrossTask/blob/master/train.py"""
227
+
228
+ recalls = self._get_recalls(Y_true=targets, Y_pred=outputs)
229
+ results = {}
230
+ for task, rec in recalls.items():
231
+ results[str(task)] = rec
232
+
233
+ avg_recall = np.mean(list(recalls.values()))
234
+ results["recall"] = avg_recall
235
+ return results
236
+
237
+ def print_computed_metrics(self, metrics):
238
+ print('Recall: {0:0.3f}'.format(metrics["recall"]))
239
+ for task in metrics:
240
+ if task != "recall":
241
+ print('Task {0}. Recall = {1:0.3f}'.format(
242
+ task, metrics[task]))
243
+
244
+ def _get_recalls(self, Y_true, Y_pred):
245
+ """refactored from
246
+ https://github.com/DmZhukov/CrossTask/blob/master/train.py"""
247
+
248
+ step_match = {task: 0 for task in Y_true.keys()}
249
+ step_total = {task: 0 for task in Y_true.keys()}
250
+ for task, ys_true in Y_true.items():
251
+ ys_pred = Y_pred[task]
252
+ for vid in set(ys_pred.keys()).intersection(set(ys_true.keys())):
253
+ y_true = ys_true[vid]
254
+ y_pred = ys_pred[vid]
255
+ step_total[task] += (y_true.sum(axis=0) > 0).sum()
256
+ step_match[task] += (y_true*y_pred).sum()
257
+ recalls = {
258
+ task: step_match[task] / n for task, n in step_total.items()}
259
+ return recalls
260
+
261
+
262
+ class ActionRecognitionMetric(Metric):
263
+ def __init__(
264
+ self,
265
+ config,
266
+ metric_names=["acc", "acc_splits", "r1_splits", "r5_splits", "r10_splits"]
267
+ ):
268
+ super().__init__(config, metric_names)
269
+
270
+ def compute_metrics(self, outputs, targets, splits, **kwargs):
271
+ all_video_embd = outputs
272
+ labels = targets
273
+ split1, split2, split3 = splits
274
+ accs = []
275
+ r1s = []
276
+ r5s = []
277
+ r10s = []
278
+ for split in range(3):
279
+ if split == 0:
280
+ s = split1
281
+ elif split == 1:
282
+ s = split2
283
+ else:
284
+ s = split3
285
+
286
+ X_pred = all_video_embd[np.where(s == 2)[0]]
287
+ label_test = labels[np.where(s == 2)[0]]
288
+ logits = X_pred
289
+ X_pred = np.argmax(X_pred, axis=1)
290
+ acc = np.sum(X_pred == label_test) / float(len(X_pred))
291
+ accs.append(acc)
292
+ # compute recall.
293
+ sorted_pred = (-logits).argsort(axis=-1)
294
+ label_test_sp = label_test.reshape(-1, 1)
295
+
296
+ r1 = np.mean((sorted_pred[:, :1] == label_test_sp).sum(axis=1), axis=0)
297
+ r5 = np.mean((sorted_pred[:, :5] == label_test_sp).sum(axis=1), axis=0)
298
+ r10 = np.mean((sorted_pred[:, :10] == label_test_sp).sum(axis=1), axis=0)
299
+ r1s.append(r1)
300
+ r5s.append(r5)
301
+ r10s.append(r10)
302
+
303
+ return {"acc": accs[0], "acc_splits": accs, "r1_splits": r1s, "r5_splits": r5s, "r10_splits": r10s}
304
+
305
+ def print_computed_metrics(self, metrics):
306
+ for split, acc in enumerate(metrics["acc_splits"]):
307
+ print("Top 1 accuracy on split {}: {}; r1 {}; r5 {}; r10 {}".format(
308
+ split + 1, acc,
309
+ metrics["r1_splits"][split],
310
+ metrics["r5_splits"][split],
311
+ metrics["r10_splits"][split],
312
+ )
313
+ )
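For orientation, a minimal usage sketch of the metric classes above on toy arrays. Passing `config=None` to the base constructor is an assumption made purely for illustration; real runs pass the MMPT config object.

    import numpy as np

    outputs = np.array([0, 1, 2, 2, 1])
    targets = np.array([0, 1, 1, 2, 1])

    qa_metric = QAMetric(config=None)                       # hypothetical config, illustration only
    print(qa_metric.compute_metrics(outputs, targets))      # {'acc': 0.8}

    seg_metric = COINActionSegmentationMetric(config=None)
    seg_metric.print_computed_metrics(
        seg_metric.compute_metrics(outputs, targets))       # frame accuracy: 0.8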
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/evaluators/predictor.py ADDED
@@ -0,0 +1,595 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ import os
6
+ import random
7
+ import json
8
+ import numpy as np
9
+ import torch
10
+ import pickle
11
+ import math
12
+
13
+ from tqdm import tqdm
14
+
15
+
16
+ class Predictor(object):
17
+ """this base class is used to save predictions to disk
18
+ (to be consumed by an evaluator later).
19
+ Predictor has minimal support for single-GPU prediction.
20
+ """
21
+ def __init__(self, config):
22
+ self.pred_dir = None # on-the-fly eval does not save the results.
23
+ if hasattr(config, "eval") and config.eval is not None:
24
+ self.pred_dir = config.eval.save_path
25
+ os.makedirs(self.pred_dir, exist_ok=True)
26
+
27
+ def __call__(self, outputs):
28
+ """extract the prediction and save it."""
29
+ raise NotImplementedError
30
+
31
+ def predict_loop(self, model, eval_dataloader, output_file=None):
32
+ """on-the-fly prediction on a single gpu."""
33
+ self.full_scores = []
34
+ model.eval()
35
+ model = model.to(0)
36
+ with torch.no_grad():
37
+ for data in eval_dataloader:
38
+ data = self.to_ctx(data)
39
+ outputs = model(**data)
40
+ outputs.update(data)
41
+ self(outputs)
42
+ return self.finalize(output_file)
43
+
44
+ def finalize(self, output_file):
45
+ pass
46
+
47
+ def to_ctx(self, data, ctx=0, dtype=None):
48
+ if isinstance(data, dict):
49
+ for key in data:
50
+ if torch.is_tensor(data[key]):
51
+ if dtype is not None and data[key].dtype == torch.float32:
52
+ data[key] = data[key].to(dtype)
53
+ data[key] = data[key].to(ctx)
54
+ return data
55
+ else:
56
+ raise ValueError("non-dict type of batch is not supported yet.")
57
+
58
+
59
+ class NLGPredictor(Predictor):
60
+ """Predicting Text from MMFusion models."""
61
+ """TODO: make a context."""
62
+ def __init__(self, config):
63
+ super().__init__(config)
64
+ from transformers import AutoTokenizer
65
+
66
+ self.tokenizer = AutoTokenizer.from_pretrained(
67
+ config.dataset.bert_name,
68
+ bos_token="[CLS]", eos_token="[SEP]")
69
+ self.bos_token_id = self.tokenizer.bos_token_id
70
+ self.eos_token_id = self.tokenizer.eos_token_id
71
+
72
+ def predict_loop(self, model, eval_dataloader, output_file=None):
73
+ """TODO: refactor base classes."""
74
+ ctx = 0
75
+ outputs = {"outputs": [], "targets": [[]]}
76
+ model.eval()
77
+ model = model.to(ctx)
78
+ with torch.no_grad():
79
+ for data in tqdm(eval_dataloader):
80
+ data = self.to_ctx(data, ctx)
81
+ self(data, model, outputs)
82
+ return self.finalize(outputs, output_file)
83
+
84
+ def __call__(self, data, model, outputs):
85
+ data.update({
86
+ "bos_token_id": self.bos_token_id,
87
+ "eos_token_id": self.eos_token_id
88
+ })
89
+
90
+ output = model.generate(**data)
91
+ assert len(output) == len(data["ref"])
92
+ for idx, _output in enumerate(output):
93
+ generated_text = self.tokenizer.decode(
94
+ _output, skip_special_tokens=True)
95
+ if generated_text == "":
96
+ generated_text = "none"
97
+ outputs["outputs"].append(generated_text)
98
+ outputs["targets"][0].append(data["ref"][idx])
99
+ if random.random() < 0.001:
100
+ print("_output", _output)
101
+ print("generated_text", generated_text)
102
+ print("ref", data["ref"][idx])
103
+
104
+ def finalize(self, outputs, output_file=None):
105
+ if output_file is not None:
106
+ with open(os.path.join(
107
+ self.pred_dir, output_file + ".json"), "w") as fw:
108
+ json.dump(outputs, fw, indent=4)
109
+ return outputs
110
+
111
+
112
+ class RetrievalPredictor(Predictor):
113
+ """generated `pooled_video` and `pooled_text`."""
114
+ def __init__(self, config):
115
+ super().__init__(config)
116
+ from transformers import AutoTokenizer
117
+ self.tokenizer = AutoTokenizer.from_pretrained(
118
+ config.dataset.bert_name)
119
+
120
+ def predict_loop(
121
+ self,
122
+ model,
123
+ eval_dataloader,
124
+ output_file="retrieval.npy"
125
+ ):
126
+ """on-the-fly prediction on a single gpu."""
127
+ full_scores = []
128
+ texts = []
129
+ model.eval()
130
+ model = model.cuda()
131
+ with torch.no_grad():
132
+ for data in eval_dataloader:
133
+ # convert to dict.
134
+ if not isinstance(data, dict):
135
+ data = {
136
+ "caps": data[0],
137
+ "cmasks": data[1],
138
+ "vfeats": data[2],
139
+ "vmasks": data[3],
140
+ "video_id": data[4]
141
+ }
142
+ data = self.to_ctx(data)
143
+ outputs = model(**data)
144
+ outputs.update(data)
145
+ self(outputs, full_scores)
146
+ for _cap in data["caps"]:
147
+ texts.append(
148
+ self.tokenizer.decode(_cap, skip_special_tokens=True)
149
+ )
150
+
151
+ return self.finalize(full_scores, texts, output_file)
152
+
153
+ def __call__(self, sample, full_scores):
154
+ scores = self._get_pooled_outputs(sample)
155
+ self._append_scores(scores, full_scores)
156
+
157
+ def finalize(self, full_scores, texts, output_file=None):
158
+ outputs = self._aggregate_scores(full_scores)
159
+ if output_file is not None:
160
+ np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs)
161
+ return {"outputs": outputs, "texts": texts}
162
+
163
+ def _get_pooled_outputs(self, outputs):
164
+ if "pooled_video" in outputs:
165
+ return outputs["pooled_video"], outputs["pooled_text"]
166
+ else:
167
+ raise ValueError("unknown format of outputs.")
168
+
169
+ def _append_scores(self, scores, full_scores):
170
+ assert len(scores) == 2
171
+ if len(full_scores) == 0:
172
+ full_scores.append([])
173
+ full_scores.append([])
174
+ full_scores[0].append(scores[0].cpu().detach().numpy())
175
+ full_scores[1].append(scores[1].cpu().detach().numpy())
176
+
177
+ def _aggregate_scores(self, scores):
178
+ assert len(scores) == 2
179
+ video_hidden = np.concatenate(scores[0], axis=0)
180
+ text_hidden = np.concatenate(scores[1], axis=0)
181
+ # clear up.
182
+ self.full_scores = []
183
+ return np.matmul(text_hidden, video_hidden.T)
184
+
185
+
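The `(num_text, num_video)` score matrix produced by `_aggregate_scores` is what the retrieval metric ranks. Assuming the usual convention that text i is paired with video i (an assumption here, not shown in this file), recall@k can be read off the matrix as follows:

    import numpy as np

    scores = np.random.rand(100, 100)             # stand-in for text_hidden @ video_hidden.T
    ranking = np.argsort(-scores, axis=1)         # best-matching video first, per text query
    gt_rank = np.where(ranking == np.arange(len(scores))[:, None])[1]
    recall_at_1 = float((gt_rank == 0).mean())
    recall_at_5 = float((gt_rank < 5).mean())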
186
+ class QAPredictor(Predictor):
187
+ """generated `pooled_video` and `pooled_text`."""
188
+ def __init__(self, config):
189
+ super().__init__(config)
190
+ """predictor maintains scores and aggregate them."""
191
+
192
+ def predict_loop(self, model, eval_dataloader, output_file="qa.npy"):
193
+ """on-the-fly prediction on a single gpu."""
194
+ self.full_scores = []
195
+ model.eval()
196
+ model = model.cuda()
197
+ with torch.no_grad():
198
+ for data in eval_dataloader:
199
+ # reshape ans and dup video 5 times.
200
+ v_len = data["vfeats"].size(1)
201
+ hidden_size = data["vfeats"].size(2)
202
+ data["vfeats"] = data["vfeats"].unsqueeze(1).repeat(1, 5, 1, 1).view(-1, v_len, hidden_size)
203
+ data["vmasks"] = data["vmasks"].unsqueeze(1).repeat(1, 5, 1).view(-1, v_len)
204
+
205
+ t_len = data["caps"].size(-1)
206
+ data["caps"] = data["caps"].view(-1, t_len)
207
+ data["cmasks"] = data["cmasks"].view(-1, t_len)
208
+
209
+ data = self.to_ctx(data)
210
+ outputs = model(**data)
211
+ outputs.update(data)
212
+ self(outputs)
213
+ return self.finalize(output_file)
214
+
215
+ def __call__(self, sample):
216
+ hidden_size = sample["pooled_video"].size(-1)
217
+ pooled_video = sample["pooled_video"].view(-1, 5, hidden_size)
218
+ pooled_text = sample["pooled_text"].view(-1, 5, hidden_size)
219
+ scores = torch.bmm(pooled_video, pooled_text.transpose(2, 1))
220
+ scores = scores.argmax(-1)
221
+ self._append_scores(scores[:, 0], sample["answers"], self.full_scores)
222
+
223
+ def finalize(self, output_file=None):
224
+ outputs, targets = self._aggregate_scores(self.full_scores)
225
+ if output_file is not None:
226
+ np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs)
227
+ return {"outputs": outputs, "targets": targets}
228
+
229
+ def _append_scores(self, scores, answers, full_scores):
230
+ if len(full_scores) == 0:
231
+ full_scores.append([])
232
+ full_scores.append([])
233
+ full_scores[0].append(scores.cpu().detach().numpy())
234
+ full_scores[1].append(answers.cpu().detach().numpy())
235
+
236
+ def _aggregate_scores(self, scores):
237
+ assert len(scores) == 2
238
+ outputs = np.concatenate(scores[0], axis=0)
239
+ targets = np.concatenate(scores[1], axis=0)
240
+ # clear up.
241
+ self.full_scores = []
242
+ return outputs, targets
243
+
244
+
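The reshapes in `QAPredictor.__call__` can be hard to follow; conceptually each video is scored against its 5 candidate answers and the argmax over candidates is kept. A toy sketch with illustrative shapes:

    import torch

    batch_size, hidden_size = 2, 8
    pooled_video = torch.randn(batch_size * 5, hidden_size)   # video was duplicated 5x upstream
    pooled_text = torch.randn(batch_size * 5, hidden_size)    # 5 candidate answers per question

    video = pooled_video.view(-1, 5, hidden_size)
    text = pooled_text.view(-1, 5, hidden_size)
    scores = torch.bmm(video, text.transpose(2, 1))            # (batch, 5, 5)
    pred = scores.argmax(-1)[:, 0]                             # row 0: the single video vs. its 5 answers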
245
+ class CrossTaskPredictor(Predictor):
246
+ """
247
+ CrossTaskPredictor needs to compute the average of logits
248
+ over overlapping sliding windows.
249
+ """
250
+ def __init__(self, config):
251
+ super().__init__(config)
252
+ self.lsm = torch.nn.LogSoftmax(dim=1)
253
+ self.max_video_len = config.dataset.max_video_len
254
+ self.sliding_window = config.dataset.sliding_window
255
+ self.sliding_window_size = config.dataset.sliding_window_size
256
+ self.annotation_path = config.dataset.annotation_path
257
+
258
+ def predict_loop(self, model, eval_dataloader, output_file="result.pkl"):
259
+ """refactored from line 144:
260
+ https://github.com/DmZhukov/CrossTask/blob/master/train.py
261
+ """
262
+ ctx = 0
263
+ model.eval()
264
+ model = model.to(ctx)
265
+ # this is not a loss but just compute neg_log_prob.
266
+ Y_pred = {}
267
+ Y_true = {}
268
+ with torch.no_grad():
269
+ for batch in eval_dataloader:
270
+ self(batch, model, Y_pred, Y_true)
271
+ return self.finalize(Y_pred, Y_true, output_file)
272
+
273
+ def __call__(self, sample, model, Y_pred, Y_true):
274
+ # please install dp from `https://github.com/DmZhukov/CrossTask`
275
+ from dp import dp
276
+ vid, task = sample['video_id'][0], sample['task'][0]
277
+ sample = self.to_ctx(sample)
278
+ # compute the average logits over sliding windows.
279
+ output = model(**sample)
280
+ batch_logits = output["logits"].cpu()
281
+
282
+ video_len = sample["video_len"][0]
283
+
284
+ # the following version is slow.
285
+ logits = torch.zeros((video_len, batch_logits.size(1)))
286
+ logits_counts = torch.zeros((video_len, 1), dtype=torch.long)
287
+ # use the same loop as aligner to recover.
288
+ batch_logit_idx = 0
289
+ for window_start in range(0, video_len, self.sliding_window):
290
+ video_end = min(video_len - window_start, self.sliding_window_size)
291
+ logits[window_start: window_start + video_end] += batch_logits[
292
+ batch_logit_idx: batch_logit_idx + video_end]
293
+ batch_logit_idx += video_end
294
+ logits_counts[window_start: window_start + video_end] += torch.ones((video_end, 1), dtype=torch.long)
295
+
296
+ if (video_len - window_start) <= self.sliding_window_size:
297
+ break
298
+
299
+ logits /= logits_counts
300
+ assert logits.size() == (video_len, batch_logits.size(1)), "{}, {}".format(logits.size(), video_len)
301
+
302
+ O = self.lsm(logits)
303
+ y = np.zeros(O.size(), dtype=np.float32)
304
+ dp(y, -O.detach().cpu().numpy())
305
+ if task not in Y_pred:
306
+ Y_pred[task] = {}
307
+ Y_pred[task][vid] = y
308
+ annot_path = os.path.join(
309
+ self.annotation_path, task+'_'+vid+'.csv')
310
+ if os.path.exists(annot_path):
311
+ if task not in Y_true:
312
+ Y_true[task] = {}
313
+ Y_true[task][vid] = self._read_assignment(
314
+ *y.shape, annot_path)
315
+
316
+ def finalize(self, Y_pred, Y_true, output_file=None):
317
+ if output_file is not None:
318
+ with open(
319
+ os.path.join(self.pred_dir, output_file + ".pkl"),
320
+ "wb") as fw:
321
+ pickle.dump(
322
+ {"Y_pred": Y_pred, "Y_true": Y_true}, fw,
323
+ protocol=pickle.HIGHEST_PROTOCOL)
324
+ return {"outputs": Y_pred, "targets": Y_true}
325
+
326
+ def _read_assignment(self, T, K, path):
327
+ """
328
+ refactored from https://github.com/DmZhukov/CrossTask/blob/master/data.py
329
+ How to interpret the constraints on the loss being minimized:
330
+ lambd is a big number;
331
+ self.lambd * C is a big number for every valid position (the csv stores the invalid ones)
332
+
333
+ def forward(self, O, Y, C):
334
+ return (Y*(self.lambd * C - self.lsm(O))).mean(dim=0).sum()
335
+
336
+ This loads the csv file and fills in the step column from the start row to the end row.
337
+ """
338
+
339
+ Y = np.zeros([T, K], dtype=np.uint8)
340
+ with open(path, 'r') as f:
341
+ for line in f:
342
+ step, start, end = line.strip().split(',')
343
+ start = int(math.floor(float(start)))
344
+ end = int(math.ceil(float(end)))
345
+ step = int(step) - 1
346
+ Y[start:end, step] = 1
347
+ return Y
348
+
349
+
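The window-merging loop in `CrossTaskPredictor.__call__` (reused by `COINPredictor._merge_windows` below) recovers one averaged logit row per frame from overlapping windows. A self-contained sketch of the same recovery; `sliding_window=2` and `sliding_window_size=4` are toy values chosen only for illustration:

    import torch

    video_len, num_classes = 6, 3
    sliding_window, sliding_window_size = 2, 4
    batch_logits = torch.randn(8, num_classes)    # 4 frames from window@0 + 4 frames from window@2

    logits = torch.zeros((video_len, num_classes))
    counts = torch.zeros((video_len, 1), dtype=torch.long)
    idx = 0
    for window_start in range(0, video_len, sliding_window):
        video_end = min(video_len - window_start, sliding_window_size)
        logits[window_start: window_start + video_end] += batch_logits[idx: idx + video_end]
        counts[window_start: window_start + video_end] += 1
        idx += video_end
        if (video_len - window_start) <= sliding_window_size:
            break
    logits /= counts                              # frames covered by both windows are averaged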
350
+ class COINPredictor(Predictor):
351
+ """
352
+ COINPredictor is similar to CrossTaskPredictor in how it merges sliding windows.
353
+ """
354
+ def __init__(self, config):
355
+ super().__init__(config)
356
+ self.max_video_len = config.dataset.max_video_len
357
+ self.sliding_window = config.dataset.sliding_window
358
+ self.sliding_window_size = config.dataset.sliding_window_size
359
+
360
+ def predict_loop(self, model, eval_dataloader, output_file="result.pkl"):
361
+ """refactored from line 144:
362
+ https://github.com/DmZhukov/CrossTask/blob/master/train.py
363
+ """
364
+ ctx = 0
365
+ model.eval()
366
+ model = model.to(ctx)
367
+ # this is not a loss but just compute neg_log_prob.
368
+ Y_pred = []
369
+ Y_true = []
370
+ with torch.no_grad():
371
+ for batch in eval_dataloader:
372
+ self(batch, model, Y_pred, Y_true)
373
+ return self.finalize(Y_pred, Y_true, output_file)
374
+
375
+ def __call__(self, sample, model, Y_pred, Y_true):
376
+ sample = self.to_ctx(sample)
377
+ # compute the average logits over sliding windows.
378
+ output = model(**sample)
379
+ logits = self._merge_windows(sample, output)
380
+ Y_pred.append(logits.argmax(dim=1))
381
+ Y_true.append(sample["video_targets"].squeeze(0).cpu())
382
+
383
+ def _merge_windows(self, sample, output):
384
+ targets = sample["targets"].reshape(-1).cpu()
385
+ valid_mask = targets != -100
386
+ targets = targets[valid_mask]
387
+ batch_logits = output["logits"].cpu()
388
+ batch_logits = batch_logits.reshape(-1, batch_logits.size(-1))
389
+ batch_logits = batch_logits[valid_mask]
390
+
391
+ video_len = sample["video_len"][0]
392
+
393
+ # the following version is slow.
394
+ logits = torch.zeros((video_len, batch_logits.size(1)))
395
+ logits_counts = torch.zeros((video_len, 1), dtype=torch.long)
396
+ # use the same loop as aligner to recover.
397
+ batch_logit_idx = 0
398
+ for window_start in range(0, video_len, self.sliding_window):
399
+ video_end = min(video_len - window_start, self.sliding_window_size)
400
+ logits[window_start: window_start + video_end] += batch_logits[
401
+ batch_logit_idx: batch_logit_idx + video_end]
402
+ batch_logit_idx += video_end
403
+ logits_counts[window_start: window_start + video_end] += torch.ones((video_end, 1), dtype=torch.long)
404
+ if (video_len - window_start) <= self.sliding_window_size:
405
+ break
406
+ logits /= logits_counts
407
+ assert logits.size() == (video_len, batch_logits.size(1)), "{}, {}".format(logits.size(), video_len)
408
+ return logits
409
+
410
+ def finalize(self, Y_pred, Y_true, output_file=None):
411
+ Y_pred = torch.cat(Y_pred, dim=0).numpy()
412
+ Y_true = torch.cat(Y_true, dim=0).numpy()
413
+ assert len(Y_pred) == len(Y_true)
414
+
415
+ error_mask = Y_pred != Y_true
416
+ print("sample error", Y_pred[error_mask][:10], Y_true[error_mask][:10])
417
+ print("sample error", Y_pred[error_mask][10:20], Y_true[error_mask][10:20])
418
+
419
+ if output_file is not None:
420
+ with open(
421
+ os.path.join(self.pred_dir, output_file + ".pkl"),
422
+ "wb") as fw:
423
+ pickle.dump(
424
+ {"Y_pred": Y_pred, "Y_true": Y_true}, fw,
425
+ protocol=pickle.HIGHEST_PROTOCOL)
426
+ return {"outputs": Y_pred, "targets": Y_true}
427
+
428
+
429
+ class COINZSPredictor(COINPredictor):
430
+ """
431
+ COINZSPredictor for COIN zero-shot prediction.
432
+ """
433
+
434
+ def __init__(self, config):
435
+ super().__init__(config)
436
+ self.dataset_config = config.dataset
437
+
438
+ def predict_loop(self, model, eval_dataloader, output_file="result.pkl"):
439
+ """refactored from line 144:
440
+ https://github.com/DmZhukov/CrossTask/blob/master/train.py
441
+ """
442
+ ctx = 0
443
+ model.eval()
444
+ model = model.to(ctx)
445
+
446
+ with torch.no_grad():
447
+ outputs = eval_dataloader.dataset.meta_processor.meta_text_labels(
448
+ self.dataset_config)
449
+ outputs = self.to_ctx(outputs, ctx)
450
+ label_hidden_states = model.forward_text(**outputs).cpu()
451
+ label_sim = label_hidden_states @ label_hidden_states.t()
452
+ num_labels = label_sim.size(0)
453
+ eye_mask = ~torch.eye(num_labels, dtype=torch.bool)
454
+ label_sim = label_sim.masked_select(eye_mask).view(num_labels, num_labels - 1)
455
+ lbd = label_sim.max()
456
+
457
+ # this is not a loss but just compute neg_log_prob.
458
+ Y_pred = []
459
+ Y_true = []
460
+ with torch.no_grad():
461
+ for batch in eval_dataloader:
462
+ self(batch, label_hidden_states, model, lbd, Y_pred, Y_true)
463
+ return self.finalize(Y_pred, Y_true, output_file)
464
+
465
+ def reshape_subsample(self, sample):
466
+ for key in sample:
467
+ if torch.is_tensor(sample[key]):
468
+ sample[key] = self.flat_subsample(sample[key])
469
+ return sample
470
+
471
+ def flat_subsample(self, tensor):
472
+ if len(tensor.size()) > 1 and tensor.size(0) == 1:
473
+ tensor = tensor.squeeze(0)
474
+ return tensor
475
+
476
+ def __call__(self, sample, label_hidden_states, model, lbd, Y_pred, Y_true):
477
+ sample = self.reshape_subsample(sample)
478
+ sample = self.to_ctx(sample)
479
+ # compute the average logits over sliding windows.
480
+ sample["output_hidden_states"] = True
481
+ video_outputs = model.forward_video(**sample).cpu()
482
+ output = {"logits": video_outputs[:, 1:sample["vmasks"].size(1)+1] @ label_hidden_states.t()}
483
+ logits = self._merge_windows(sample, output)
484
+ # logic of zero-shot for sequence labeling.
485
+ logits_argmax = logits.argmax(dim=1) + 1 # 0 is "O" label.
486
+ logits_max = logits.max(dim=1)[0]
487
+
488
+ pred = torch.zeros_like(logits_argmax)
489
+ label_select = logits_max > lbd # 73 or 74
490
+ pred[label_select] = logits_argmax[label_select]
491
+
492
+ Y_pred.append(pred)
493
+ Y_true.append(sample["video_targets"].squeeze(0).cpu())
494
+
495
+ def finalize(self, Y_pred, Y_true, output_file=None):
496
+ Y_pred = torch.cat(Y_pred, dim=0).numpy()
497
+ Y_true = torch.cat(Y_true, dim=0).numpy()
498
+ assert len(Y_pred) == len(Y_true)
499
+
500
+ error_mask = Y_pred != Y_true
501
+ print("sample error", Y_pred[error_mask][:10], Y_true[error_mask][:10])
502
+ print("sample error", Y_pred[error_mask][10:20], Y_true[error_mask][10:20])
503
+
504
+ if output_file is not None:
505
+ with open(
506
+ os.path.join(self.pred_dir, output_file + ".pkl"),
507
+ "wb") as fw:
508
+ pickle.dump(
509
+ {"Y_pred": Y_pred, "Y_true": Y_true}, fw,
510
+ protocol=pickle.HIGHEST_PROTOCOL)
511
+ return {"outputs": Y_pred, "targets": Y_true}
512
+
513
+
514
+ class DiDeMoPredictor(Predictor):
515
+ """reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py
516
+ https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/data_processing.py
517
+ """
518
+ def __init__(self, config):
519
+ super().__init__(config)
520
+ # load targets.
521
+ with open(config.dataset.test_path) as data_file:
522
+ self.test_data = json.load(data_file)
523
+
524
+ def predict_loop(self, model, eval_dataloader, output_file="didemo.npy"):
525
+ """
526
+ TODO: two solutions here.
527
+ """
528
+ import itertools
529
+ # 21 candidate segments: 6 single chunks plus the 15 pairs added below.
530
+ self.possible_segments = [(0,0), (1,1), (2,2), (3,3), (4,4), (5,5)]
531
+ for i in itertools.combinations(range(6), 2):
532
+ self.possible_segments.append(i)
533
+ # pick segments from a video.
534
+
535
+ """on-the-fly prediction on a single gpu."""
536
+ self.full_scores = []
537
+ model.eval()
538
+ model = model.cuda()
539
+ with torch.no_grad():
540
+ for data in eval_dataloader:
541
+ # TODO special forwarding logic here.
542
+ data = self.to_ctx(data)
543
+ data["output_hidden_states"] = True
544
+ hidden_video = model.forward_video(**data)
545
+ data["output_hidden_states"] = False
546
+ pooled_text = model.forward_text(**data)
547
+ outputs = {
548
+ "hidden_video": hidden_video,
549
+ "pooled_text": pooled_text
550
+ }
551
+ outputs.update(data)
552
+ self(outputs)
553
+ return self.finalize(output_file)
554
+
555
+ def __call__(self, sample):
556
+ # TODO: make an index select from self.possible_segments.
557
+ hidden_video = sample["hidden_video"]
558
+ pooled_text = sample["pooled_text"]
559
+ vmasks = sample["vmasks"]
560
+ # probably maintain valid results here.
561
+
562
+ hidden_video = hidden_video[:, 1:-1, :]
563
+ # probably maintain valid results here.
564
+ pooled_video = []
565
+ for s, e in self.possible_segments:
566
+ pooled_video.append(
567
+ torch.mean(
568
+ hidden_video[:, int(s*5):int((e+1)*5), :],
569
+ dim=1, keepdim=True)
570
+ )
571
+ pooled_video = torch.cat(pooled_video, dim=1)
572
+ scores = torch.bmm(
573
+ pooled_video, pooled_text.unsqueeze(-1)).squeeze(-1).cpu()
574
+
575
+ ranks = scores.argsort(dim=-1, descending=True)
576
+
577
+ for batch_idx, rank in enumerate(ranks):
578
+ rank_of_moment = []
579
+ for m_idx, moment in enumerate(rank):
580
+ s, e = self.possible_segments[moment.item()]
581
+ if torch.any(
582
+ vmasks[batch_idx, int(s*5):int((e+1)*5)]
583
+ ):
584
+ rank_of_moment.append((s, e))
585
+ self.full_scores.append(rank_of_moment)
586
+
587
+ def finalize(self, output_file=None):
588
+ outputs = self._aggregate_scores(self.full_scores)
589
+ if output_file is not None:
590
+ np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs)
591
+ return {"outputs": outputs, "targets": self.test_data}
592
+
593
+ def _aggregate_scores(self, scores):
594
+ self.full_scores = []
595
+ return scores
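End to end, a predictor and its matching metric compose roughly as below; the `config`, `model`, and `eval_dataloader` objects are assumed to come from the surrounding MMPT task setup and are not constructed here:

    predictor = QAPredictor(config)
    results = predictor.predict_loop(model, eval_dataloader, output_file=None)  # skip saving

    metric = QAMetric(config)
    computed = metric.compute_metrics(results["outputs"], results["targets"])
    metric.print_computed_metrics(computed)       # e.g. "acc: 0.7321" (value illustrative)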
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ from .loss import *
6
+ from .nce import *
7
+
8
+ try:
9
+ from .fairseqmmloss import *
10
+ except ImportError:
11
+ pass
12
+
13
+ try:
14
+ from .expnce import *
15
+ except ImportError:
16
+ pass
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/fairseqmmloss.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ TODO (huxu): a general fairseq criterion for all your pre-defined losses.
8
+ """
9
+
10
+ from fairseq.criterions import FairseqCriterion, register_criterion
11
+ from fairseq import metrics
12
+
13
+
14
+ @register_criterion("mmloss")
15
+ class MMCriterion(FairseqCriterion):
16
+ def __init__(self, task):
17
+ super().__init__(task)
18
+ # TODO (huxu): wrap forward call of loss_fn and eval_fn into task.
19
+ self.mmtask = task.mmtask
20
+
21
+ def forward(self, model, sample):
22
+ """Compute the loss for the given sample.
23
+ Returns a tuple with three elements:
24
+ 1) the loss
25
+ 2) the sample size, which is used as the denominator for the gradient
26
+ 3) logging outputs to display while training
27
+ """
28
+ outputs = self.mmtask(model, sample)
29
+
30
+ loss, loss_scalar, max_len, batch_size, sample_size = (
31
+ outputs["loss"],
32
+ outputs["loss_scalar"],
33
+ outputs["max_len"],
34
+ outputs["batch_size"],
35
+ outputs["sample_size"],
36
+ )
37
+
38
+ logging_output = {
39
+ "loss": loss_scalar,
40
+ "ntokens": max_len * batch_size, # dummy report.
41
+ "nsentences": batch_size, # dummy report.
42
+ "sample_size": sample_size,
43
+ }
44
+
45
+ return loss, 1, logging_output
46
+
47
+ @staticmethod
48
+ def reduce_metrics(logging_outputs) -> None:
49
+ """Aggregate logging outputs from data parallel training."""
50
+ """since we use NCE, our actual batch_size is 1 per GPU.
51
+ We then take the mean over workers."""
52
+ loss_sum = sum(log.get("loss", 0.0) for log in logging_outputs)
53
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
54
+ metrics.log_scalar("loss", loss_sum / sample_size, round=3)
55
+
56
+ @staticmethod
57
+ def logging_outputs_can_be_summed() -> bool:
58
+ """
59
+ Whether the logging outputs returned by `forward` can be summed
60
+ across workers prior to calling `reduce_metrics`. Setting this
61
+ to True will improves distributed training speed.
62
+ """
63
+ return True
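The aggregation in `reduce_metrics` is a plain sample-size-weighted mean over workers; a quick plain-Python check with illustrative numbers:

    logging_outputs = [
        {"loss": 0.9, "sample_size": 1},   # worker 0
        {"loss": 0.7, "sample_size": 1},   # worker 1
    ]
    loss_sum = sum(log.get("loss", 0.0) for log in logging_outputs)
    sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
    assert abs(loss_sum / sample_size - 0.8) < 1e-9   # reported via metrics.log_scalar("loss", ...)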
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/loss.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright (c) Facebook, Inc. All Rights Reserved
2
+
3
+ import torch
4
+
5
+ from torch import nn
6
+
7
+
8
+ class Loss(object):
9
+ def __call__(self, *args, **kwargs):
10
+ raise NotImplementedError
11
+
12
+
13
+ # Dummy Loss for testing.
14
+ class DummyLoss(Loss):
15
+ def __init__(self):
16
+ self.loss = nn.CrossEntropyLoss()
17
+
18
+ def __call__(self, logits, targets, **kwargs):
19
+ return self.loss(logits, targets)
20
+
21
+
22
+ class DummyK400Loss(Loss):
23
+ """dummy k400 loss for MViT."""
24
+ def __init__(self):
25
+ self.loss = nn.CrossEntropyLoss()
26
+
27
+ def __call__(self, logits, targets, **kwargs):
28
+ return self.loss(
29
+ logits, torch.randint(0, 400, (logits.size(0),), device=logits.device))
30
+
31
+
32
+ class CrossEntropy(Loss):
33
+ def __init__(self):
34
+ self.loss = nn.CrossEntropyLoss()
35
+
36
+ def __call__(self, logits, targets, **kwargs):
37
+ return self.loss(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
38
+
39
+
40
+ class ArgmaxCrossEntropy(Loss):
41
+ def __init__(self):
42
+ self.loss = nn.CrossEntropyLoss()
43
+
44
+ def __call__(self, logits, targets, **kwargs):
45
+ return self.loss(logits, targets.argmax(dim=1))
46
+
47
+
48
+ class BCE(Loss):
49
+ def __init__(self):
50
+ self.loss = nn.BCEWithLogitsLoss()
51
+
52
+ def __call__(self, logits, targets, **kwargs):
53
+ targets = targets.squeeze(0)
54
+ return self.loss(logits, targets)
55
+
56
+
57
+ class NLGLoss(Loss):
58
+ def __init__(self):
59
+ self.loss = nn.CrossEntropyLoss()
60
+
61
+ def __call__(self, logits, text_label, **kwargs):
62
+ targets = text_label[text_label != -100]
63
+ return self.loss(logits, targets)
64
+
65
+
66
+ class MSE(Loss):
67
+ def __init__(self):
68
+ self.loss = nn.MSELoss()
69
+
70
+ def __call__(self, logits, targets, **kwargs):
71
+ return self.loss(logits, targets)
72
+
73
+
74
+ class L1(Loss):
75
+ def __init__(self):
76
+ self.loss = nn.L1Loss()
77
+
78
+ def __call__(self, logits, targets, **kwargs):
79
+ return self.loss(logits, targets)
80
+
81
+
82
+ class SmoothL1(Loss):
83
+ def __init__(self):
84
+ self.loss = nn.SmoothL1Loss()
85
+
86
+ def __call__(self, logits, targets, **kwargs):
87
+ return self.loss(logits, targets)
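A short sketch of how these thin wrappers are invoked; the tensors are random and purely illustrative:

    import torch

    logits = torch.randn(4, 10)
    targets = torch.randint(0, 10, (4,))

    print(CrossEntropy()(logits, targets))
    one_hot = torch.nn.functional.one_hot(targets, num_classes=10).float()
    print(ArgmaxCrossEntropy()(logits, one_hot))   # argmax over the one-hot recovers the same targets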
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/losses/nce.py ADDED
@@ -0,0 +1,156 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ softmax-based NCE loss, used by this project.
8
+ """
9
+
10
+ import torch
11
+
12
+ from torch import nn
13
+
14
+ from .loss import Loss
15
+
16
+
17
+ class NCE(Loss):
18
+ def __init__(self):
19
+ # TODO (huxu): define temperature.
20
+ self.loss = nn.CrossEntropyLoss()
21
+
22
+ def __call__(self, align_scores, **kargs):
23
+ # note: we reuse the same shape as cls head in BERT (batch_size, 2)
24
+ # but NCE only needs one logits.
25
+ # (so we drop all weights in the second neg logits.)
26
+ align_scores = align_scores[:, :1]
27
+ # duplicate negative examples
28
+ batch_size = align_scores.size(0) // 2
29
+ pos_scores = align_scores[:batch_size]
30
+ neg_scores = align_scores[batch_size:].view(1, batch_size).repeat(
31
+ batch_size, 1)
32
+ scores = torch.cat([pos_scores, neg_scores], dim=1)
33
+ return self.loss(
34
+ scores,
35
+ torch.zeros(
36
+ (batch_size,),
37
+ dtype=torch.long,
38
+ device=align_scores.device),
39
+ )
40
+
41
+
42
+ class T2VContraLoss(Loss):
43
+ """NCE for MM joint space, on softmax text2video matrix.
44
+ """
45
+ def __init__(self):
46
+ # TODO (huxu): define temperature.
47
+ self.loss = nn.CrossEntropyLoss()
48
+
49
+ def __call__(self, pooled_video, pooled_text, **kargs):
50
+ batch_size = pooled_video.size(0)
51
+ logits = torch.mm(pooled_text, pooled_video.transpose(1, 0))
52
+ targets = torch.arange(
53
+ batch_size,
54
+ dtype=torch.long,
55
+ device=pooled_video.device)
56
+ return self.loss(logits, targets)
57
+
58
+
59
+ class V2TContraLoss(Loss):
60
+ """NCE for MM joint space, with softmax on video2text matrix."""
61
+
62
+ def __init__(self):
63
+ # TODO (huxu): define temperature.
64
+ self.loss = nn.CrossEntropyLoss()
65
+
66
+ def __call__(self, pooled_video, pooled_text, **kargs):
67
+ batch_size = pooled_video.size(0)
68
+ logits = torch.mm(pooled_video, pooled_text.transpose(1, 0))
69
+ targets = torch.arange(
70
+ batch_size,
71
+ dtype=torch.long,
72
+ device=pooled_video.device)
73
+ return self.loss(logits, targets)
74
+
75
+
76
+ class MMContraLoss(Loss):
77
+ def __init__(self):
78
+ self.loss = nn.CrossEntropyLoss()
79
+
80
+ def __call__(self, pooled_video, pooled_text, **kwargs):
81
+ logits_per_video = pooled_video @ pooled_text.t()
82
+ logits_per_text = pooled_text @ pooled_video.t()
83
+
84
+ targets = torch.arange(
85
+ pooled_video.size(0),
86
+ dtype=torch.long,
87
+ device=pooled_video.device)
88
+ loss_video = self.loss(logits_per_video, targets)
89
+ loss_text = self.loss(logits_per_text, targets)
90
+ return loss_video + loss_text
91
+
92
+
93
+ class MTM(Loss):
94
+ """Combination of MFM and MLM."""
95
+
96
+ def __init__(self):
97
+ self.loss = nn.CrossEntropyLoss()
98
+
99
+ def __call__(
100
+ self,
101
+ video_logits,
102
+ text_logits,
103
+ video_label,
104
+ text_label,
105
+ **kwargs
106
+ ):
107
+ text_logits = torch.cat([
108
+ text_logits,
109
+ torch.zeros(
110
+ (text_logits.size(0), 1), device=text_logits.device)
111
+ ], dim=1)
112
+ vt_logits = torch.cat([video_logits, text_logits], dim=0)
113
+ # loss for video.
114
+ video_label = torch.zeros(
115
+ (video_logits.size(0),),
116
+ dtype=torch.long,
117
+ device=video_logits.device
118
+ )
119
+
120
+ # loss for text.
121
+ text_label = text_label.reshape(-1)
122
+ labels_mask = text_label != -100
123
+ selected_text_label = text_label[labels_mask]
124
+
125
+ vt_label = torch.cat([video_label, selected_text_label], dim=0)
126
+ return self.loss(vt_logits, vt_label)
127
+
128
+
129
+ class MFMMLM(Loss):
130
+ """Combination of MFM and MLM."""
131
+
132
+ def __init__(self):
133
+ self.loss = nn.CrossEntropyLoss()
134
+
135
+ def __call__(
136
+ self,
137
+ video_logits,
138
+ text_logits,
139
+ video_label,
140
+ text_label,
141
+ **kwargs
142
+ ):
143
+ # loss for video.
144
+ video_label = torch.zeros(
145
+ (video_logits.size(0),),
146
+ dtype=torch.long,
147
+ device=video_logits.device
148
+ )
149
+ masked_frame_loss = self.loss(video_logits, video_label)
150
+
151
+ # loss for text.
152
+ text_label = text_label.reshape(-1)
153
+ labels_mask = text_label != -100
154
+ selected_text_label = text_label[labels_mask]
155
+ masked_lm_loss = self.loss(text_logits, selected_text_label)
156
+ return masked_frame_loss + masked_lm_loss
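All of the contrastive losses above treat the i-th video and i-th text in a batch as the positive pair, i.e. the diagonal of the similarity matrix; a minimal sketch with random embeddings:

    import torch

    pooled_video = torch.randn(8, 768)
    pooled_text = torch.randn(8, 768)

    print(T2VContraLoss()(pooled_video, pooled_text))   # softmax over videos for each text
    print(V2TContraLoss()(pooled_video, pooled_text))   # softmax over texts for each video
    print(MMContraLoss()(pooled_video=pooled_video, pooled_text=pooled_text))  # symmetric sum

Note that none of these define a learnable temperature; the code above flags that as a TODO.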
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ from .mmfusion import *
6
+ from .transformermodel import *
7
+ from .mmfusionnlg import *
8
+
9
+ try:
10
+ from .fairseqmmmodel import *
11
+ except ImportError:
12
+ pass
13
+
14
+ try:
15
+ from .expmmfusion import *
16
+ except ImportError:
17
+ pass
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/fairseqmmmodel.py ADDED
@@ -0,0 +1,51 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from fairseq.models import (
7
+ BaseFairseqModel,
8
+ register_model,
9
+ register_model_architecture
10
+ )
11
+
12
+
13
+ @register_model("mmmodel")
14
+ class FairseqMMModel(BaseFairseqModel):
15
+ """a fairseq wrapper of model built by `task`."""
16
+
17
+ @classmethod
18
+ def build_model(cls, args, task):
19
+ return FairseqMMModel(task.mmtask.model)
20
+
21
+ def __init__(self, mmmodel):
22
+ super().__init__()
23
+ self.mmmodel = mmmodel
24
+
25
+ def forward(self, *args, **kwargs):
26
+ return self.mmmodel(*args, **kwargs)
27
+
28
+ def upgrade_state_dict_named(self, state_dict, name):
29
+
30
+ super().upgrade_state_dict_named(state_dict, name)
31
+
32
+ keys_to_delete = []
33
+
34
+ for key in state_dict:
35
+ if key not in self.state_dict():
36
+ keys_to_delete.append(key)
37
+ for key in keys_to_delete:
38
+ print("[INFO]", key, "not used anymore.")
39
+ del state_dict[key]
40
+
41
+ # copy any newly defined parameters.
42
+ for key in self.state_dict():
43
+ if key not in state_dict:
44
+ print("[INFO] adding", key)
45
+ state_dict[key] = self.state_dict()[key]
46
+
47
+
48
+ # a dummy arch, we config the model.
49
+ @register_model_architecture("mmmodel", "mmarch")
50
+ def mmarch(args):
51
+ pass
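The `upgrade_state_dict_named` hook above amounts to a key diff between the loaded checkpoint and the freshly built module; schematically, with toy dictionaries and illustrative key names:

    checkpoint_state = {"mmmodel.old_head.weight": 0, "mmmodel.encoder.weight": 1}
    current_state = {"mmmodel.new_head.weight": 2, "mmmodel.encoder.weight": 1}

    stale = [k for k in checkpoint_state if k not in current_state]    # logged "not used anymore", then deleted
    missing = [k for k in current_state if k not in checkpoint_state]  # logged "adding", copied from the new module
    assert stale == ["mmmodel.old_head.weight"] and missing == ["mmmodel.new_head.weight"]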
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/mmfusion.py ADDED
@@ -0,0 +1,926 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Copyright (c) Facebook, Inc. All Rights Reserved
17
+
18
+
19
+ import torch
20
+
21
+ from torch import nn
22
+
23
+ try:
24
+ from transformers import AutoConfig, AutoTokenizer
25
+ except ImportError:
26
+ pass
27
+
28
+ from . import transformermodel
29
+
30
+
31
+ class MMPTModel(nn.Module):
32
+ """An e2e wrapper of inference model.
33
+ """
34
+ @classmethod
35
+ def from_pretrained(cls, config, checkpoint="checkpoint_best.pt"):
36
+ import os
37
+ from ..utils import recursive_config
38
+ from ..tasks import Task
39
+ config = recursive_config(config)
40
+ mmtask = Task.config_task(config)
41
+ checkpoint_path = os.path.join(config.eval.save_path, checkpoint)
42
+ mmtask.build_model(checkpoint=checkpoint_path)
43
+ # TODO(huxu): make the video encoder configurable.
44
+ from ..processors.models.s3dg import S3D
45
+ video_encoder = S3D('pretrained_models/s3d_dict.npy', 512)
46
+ video_encoder.load_state_dict(
47
+ torch.load('pretrained_models/s3d_howto100m.pth'))
48
+ from transformers import AutoTokenizer
49
+ tokenizer = AutoTokenizer.from_pretrained(
50
+ config.dataset.bert_name, use_fast=config.dataset.use_fast
51
+ )
52
+ from ..processors import Aligner
53
+ aligner = Aligner(config.dataset)
54
+ return (
55
+ MMPTModel(config, mmtask.model, video_encoder),
56
+ tokenizer,
57
+ aligner
58
+ )
59
+
60
+ def __init__(self, config, model, video_encoder, **kwargs):
61
+ super().__init__()
62
+ self.max_video_len = config.dataset.max_video_len
63
+ self.video_encoder = video_encoder
64
+ self.model = model
65
+
66
+ def forward(self, video_frames, caps, cmasks, return_score=False):
67
+ bsz = video_frames.size(0)
68
+ assert bsz == 1, "only bsz=1 is supported now."
69
+ seq_len = video_frames.size(1)
70
+ video_frames = video_frames.view(-1, *video_frames.size()[2:])
71
+ vfeats = self.video_encoder(video_frames.permute(0, 4, 1, 2, 3))
72
+ vfeats = vfeats['video_embedding']
73
+ vfeats = vfeats.view(bsz, seq_len, vfeats.size(-1))
74
+ padding = torch.zeros(
75
+ bsz, self.max_video_len - seq_len, vfeats.size(-1))
76
+ vfeats = torch.cat([vfeats, padding], dim=1)
77
+ vmasks = torch.cat([
78
+ torch.ones((bsz, seq_len), dtype=torch.bool),
79
+ torch.zeros((bsz, self.max_video_len - seq_len), dtype=torch.bool)
80
+ ],
81
+ dim=1
82
+ )
83
+ output = self.model(caps, cmasks, vfeats, vmasks)
84
+ if return_score:
85
+ output = {"score": torch.bmm(
86
+ output["pooled_video"][:, None, :],
87
+ output["pooled_text"][:, :, None]
88
+ ).squeeze(-1).squeeze(-1)}
89
+ return output
90
+
91
+
92
+ class MMFusion(nn.Module):
93
+ """a MMPT wrapper class for MMBert style models.
94
+ TODO: move isolated mask to a subclass.
95
+ """
96
+ def __init__(self, config, **kwargs):
97
+ super().__init__()
98
+ transformer_config = AutoConfig.from_pretrained(
99
+ config.dataset.bert_name)
100
+ self.hidden_size = transformer_config.hidden_size
101
+ self.is_train = False
102
+ if config.dataset.train_path is not None:
103
+ self.is_train = True
104
+ # 0 means no iso; 1-12 means iso up to that layer.
105
+ self.num_hidden_layers = transformer_config.num_hidden_layers
106
+ self.last_iso_layer = 0
107
+ if config.dataset.num_iso_layer is not None:
108
+ self.last_iso_layer = config.dataset.num_iso_layer - 1 + 1
109
+
110
+ if config.model.mm_encoder_cls is not None:
111
+ mm_encoder_cls = getattr(transformermodel, config.model.mm_encoder_cls)
112
+ model_config = AutoConfig.from_pretrained(config.dataset.bert_name)
113
+ model_config.max_video_len = config.dataset.max_video_len
114
+ # TODO: a general way to add parameter for a model.
115
+ model_config.use_seg_emb = config.model.use_seg_emb
116
+ self.mm_encoder = mm_encoder_cls.from_pretrained(
117
+ config.dataset.bert_name, config=model_config)
118
+ elif config.model.video_encoder_cls is not None\
119
+ and config.model.text_encoder_cls is not None:
120
+ video_encoder_cls = getattr(transformermodel, config.model.video_encoder_cls)
121
+ model_config = AutoConfig.from_pretrained(config.dataset.bert_name)
122
+ model_config.max_video_len = config.dataset.max_video_len
123
+ # TODO: make each model a set of config class.
124
+ if hasattr(model_config, "num_layers"):
125
+ model_config.num_layers = config.model.num_hidden_video_layers
126
+ else:
127
+ model_config.num_hidden_layers = config.model.num_hidden_video_layers
128
+ self.video_encoder = video_encoder_cls.from_pretrained(
129
+ config.dataset.bert_name, config=model_config)
130
+ # exact same NLP model from Huggingface.
131
+ text_encoder_cls = getattr(transformermodel, config.model.text_encoder_cls)
132
+ self.text_encoder = text_encoder_cls.from_pretrained(
133
+ config.dataset.bert_name)
134
+ else:
135
+ raise ValueError("the encoder must be either MM or two backbones.")
136
+
137
+ def forward(
138
+ self,
139
+ caps,
140
+ cmasks,
141
+ vfeats,
142
+ vmasks,
143
+ **kwargs
144
+ ):
145
+ raise NotImplementedError(
146
+ "Please derive MMFusion module."
147
+ )
148
+
149
+ def _mm_on_the_fly(
150
+ self,
151
+ cmasks,
152
+ vmasks,
153
+ attention_mask
154
+ ):
155
+ """helper function for mask, seg_ids and token_type_ids."""
156
+ if attention_mask is None:
157
+ attention_mask = self._mm_attention_mask(cmasks, vmasks)
158
+
159
+ """
160
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
161
+ | first sequence | second sequence |
162
+ """
163
+ token_type_ids = torch.cat(
164
+ [
165
+ torch.zeros(
166
+ (vmasks.size(0), vmasks.size(1) + 2),
167
+ dtype=torch.long,
168
+ device=vmasks.device,
169
+ ),
170
+ torch.ones(
171
+ (cmasks.size(0), cmasks.size(1) - 2),
172
+ dtype=torch.long,
173
+ device=cmasks.device,
174
+ ),
175
+ ],
176
+ dim=1,
177
+ )
178
+ return attention_mask, token_type_ids
179
+
180
+ def _mm_attention_mask(self, cmasks, vmasks):
181
+ assert cmasks.size(0) == vmasks.size(0), "{}, {}, {}, {}".format(
182
+ str(cmasks.size()),
183
+ str(vmasks.size()),
184
+ str(cmasks.size(0)),
185
+ str(vmasks.size(0)),
186
+ )
187
+
188
+ mm_mask = torch.cat([cmasks[:, :1], vmasks, cmasks[:, 1:]], dim=1)
189
+ if self.last_iso_layer == 0:
190
+ # hard attention mask.
191
+ return mm_mask
192
+ else:
193
+ # a gpu iso mask; 0 : num_iso_layer is isolated;
194
+ # num_iso_layer: are MM-fused.
195
+ # make an iso layer
196
+ batch_size = cmasks.size(0)
197
+ iso_mask = self._make_iso_mask(batch_size, cmasks, vmasks)
198
+ mm_mask = mm_mask[:, None, :].repeat(1, mm_mask.size(-1), 1)
199
+ iso_mm_masks = []
200
+ # hard attention mask.
201
+ iso_mask = iso_mask[:, None, :, :].repeat(
202
+ 1, self.last_iso_layer, 1, 1)
203
+ iso_mm_masks.append(iso_mask)
204
+ if self.last_iso_layer < self.num_hidden_layers:
205
+ mm_mask = mm_mask[:, None, :, :].repeat(
206
+ 1, self.num_hidden_layers - self.last_iso_layer, 1, 1
207
+ )
208
+ iso_mm_masks.append(mm_mask)
209
+ iso_mm_masks = torch.cat(iso_mm_masks, dim=1)
210
+ return iso_mm_masks
211
+
212
+ def _make_iso_mask(self, batch_size, cmasks, vmasks):
213
+ cls_self_mask = torch.cat(
214
+ [
215
+ torch.ones(
216
+ (batch_size, 1), dtype=torch.bool, device=cmasks.device),
217
+ torch.zeros(
218
+ (batch_size, cmasks.size(1) + vmasks.size(1) - 1),
219
+ dtype=torch.bool, device=cmasks.device)
220
+ ], dim=1)
221
+
222
+ iso_video_mask = torch.cat(
223
+ [
224
+ # [CLS] is not used.
225
+ torch.zeros(
226
+ (batch_size, 1), dtype=torch.bool, device=cmasks.device
227
+ ),
228
+ vmasks,
229
+ # assume to be 1.
230
+ cmasks[:, 1:2],
231
+ # 2 means [CLS] + [SEP]
232
+ torch.zeros(
233
+ (batch_size, cmasks.size(1) - 2),
234
+ dtype=torch.bool,
235
+ device=cmasks.device,
236
+ ),
237
+ ],
238
+ dim=1,
239
+ )
240
+ iso_text_mask = torch.cat(
241
+ [
242
+ torch.zeros(
243
+ (batch_size, 2 + vmasks.size(1)),
244
+ dtype=torch.bool,
245
+ device=cmasks.device,
246
+ ), # [CLS] is not used.
247
+ cmasks[:, 2:], # assume to be 1.
248
+ ],
249
+ dim=1,
250
+ )
251
+ cls_self_mask = cls_self_mask[:, None, :]
252
+ iso_video_mask = iso_video_mask[:, None, :].repeat(
253
+ 1, vmasks.size(1) + 1, 1)
254
+ iso_text_mask = iso_text_mask[:, None, :].repeat(
255
+ 1, cmasks.size(1) - 2, 1)
256
+ return torch.cat([cls_self_mask, iso_video_mask, iso_text_mask], dim=1)
257
+
258
+ def _pooling_vt_layer(
259
+ self,
260
+ layered_sequence_output,
261
+ cmasks,
262
+ vmasks
263
+ ):
264
+ layer_idx = self.last_iso_layer \
265
+ if self.last_iso_layer > 0 else self.num_hidden_layers
266
+ hidden_state = layered_sequence_output[layer_idx]
267
+ # also output pooled_video and pooled_text.
268
+ batch_size = cmasks.size(0)
269
+ # pool the modality.
270
+ text_offset = vmasks.size(1) + 2 # [CLS] + [SEP]
271
+ # video tokens + [SEP]
272
+ video_outputs = hidden_state[:, 1:text_offset]
273
+ video_attention_mask = torch.cat(
274
+ [
275
+ vmasks,
276
+ torch.ones(
277
+ (batch_size, 1), dtype=torch.bool, device=vmasks.device),
278
+ ],
279
+ dim=1,
280
+ )
281
+ assert video_outputs.size(1) == video_attention_mask.size(1)
282
+ pooled_video = torch.sum(
283
+ video_outputs * video_attention_mask.unsqueeze(-1), dim=1
284
+ ) / video_attention_mask.sum(1, keepdim=True)
285
+ # pooled_video = torch.mean(video_outputs[0], dim=1)
286
+
287
+ # text tokens + [SEP]
288
+ text_attention_mask = cmasks[:, 2:]
289
+ text_outputs = hidden_state[:, text_offset:]
290
+ assert text_outputs.size(1) == text_attention_mask.size(1)
291
+ pooled_text = torch.sum(
292
+ text_outputs * text_attention_mask.unsqueeze(-1), dim=1
293
+ ) / text_attention_mask.sum(1, keepdim=True)
294
+ return pooled_video, pooled_text
295
+
296
+
297
+ class MMFusionMFMMLM(MMFusion):
298
+ """forward function for MFM and MLM."""
299
+ def forward(
300
+ self,
301
+ caps,
302
+ cmasks,
303
+ vfeats,
304
+ vmasks,
305
+ attention_mask=None,
306
+ video_label=None,
307
+ text_label=None,
308
+ **kwargs
309
+ ):
310
+ output_hidden_states = False if self.is_train else True
311
+
312
+ target_vfeats, non_masked_frame_mask = None, None
313
+ if video_label is not None:
314
+ target_vfeats = vfeats.masked_select(
315
+ video_label.unsqueeze(-1)).view(
316
+ -1, vfeats.size(-1)
317
+ )
318
+ # mask video token.
319
+ vfeats[video_label] = 0.0
320
+ non_masked_frame_mask = vmasks.clone()
321
+ non_masked_frame_mask[video_label] = False
322
+
323
+ attention_mask, token_type_ids = self._mm_on_the_fly(
324
+ cmasks, vmasks, attention_mask)
325
+
326
+ outputs = self.mm_encoder(
327
+ input_ids=caps,
328
+ input_video_embeds=vfeats,
329
+ attention_mask=attention_mask,
330
+ token_type_ids=token_type_ids,
331
+ masked_frame_labels=video_label,
332
+ target_video_hidden_states=target_vfeats,
333
+ non_masked_frame_mask=non_masked_frame_mask,
334
+ masked_lm_labels=text_label,
335
+ output_hidden_states=output_hidden_states,
336
+ )
337
+
338
+ video_logits, text_logits = outputs[0], outputs[1]
339
+
340
+ if self.is_train: # return earlier for training.
341
+ return {
342
+ "video_logits": video_logits,
343
+ "text_logits": text_logits,
344
+ }
345
+
346
+ pooled_video, pooled_text = self._pooling_vt_layer(
347
+ outputs[2], cmasks, vmasks)
348
+ return {"pooled_video": pooled_video, "pooled_text": pooled_text}
349
+
350
+
351
+ class MMFusionMTM(MMFusionMFMMLM):
352
+ def __init__(self, config, **kwargs):
353
+ super().__init__(config)
354
+ """
355
+ For reproducibility:
356
+ self.mm_encoder will be initialized then discarded.
357
+ """
358
+ from .transformermodel import MMBertForMTM
359
+ model_config = AutoConfig.from_pretrained(config.dataset.bert_name)
360
+ model_config.max_video_len = config.dataset.max_video_len
361
+ model_config.use_seg_emb = config.model.use_seg_emb
362
+ self.mm_encoder = MMBertForMTM.from_pretrained(
363
+ config.dataset.bert_name, config=model_config)
364
+
365
+
366
+ class MMFusionShare(MMFusion):
367
+ """A retrival wrapper using mm_encoder as both video/text backbone.
368
+ TODO: move formally.
369
+ """
370
+ def forward(
371
+ self,
372
+ caps,
373
+ cmasks,
374
+ vfeats,
375
+ vmasks,
376
+ attention_mask=None,
377
+ video_label=None,
378
+ text_label=None,
379
+ output_hidden_states=False,
380
+ **kwargs
381
+ ):
382
+ pooled_video = self.forward_video(
383
+ vfeats,
384
+ vmasks,
385
+ caps,
386
+ cmasks,
387
+ output_hidden_states
388
+ )
389
+
390
+ pooled_text = self.forward_text(
391
+ caps,
392
+ cmasks,
393
+ output_hidden_states
394
+ )
395
+
396
+ return {"pooled_video": pooled_video, "pooled_text": pooled_text}
397
+
398
+ def forward_video(
399
+ self,
400
+ vfeats,
401
+ vmasks,
402
+ caps,
403
+ cmasks,
404
+ output_hidden_states=False,
405
+ **kwargs
406
+ ):
407
+ input_ids = caps[:, :2]
408
+
409
+ attention_mask = torch.cat([
410
+ cmasks[:, :1],
411
+ vmasks,
412
+ cmasks[:, 1:2]
413
+ ], dim=1)
414
+
415
+ token_type_ids = torch.zeros(
416
+ (vmasks.size(0), vmasks.size(1) + 2),
417
+ dtype=torch.long,
418
+ device=vmasks.device)
419
+
420
+ outputs = self.mm_encoder(
421
+ input_ids=input_ids,
422
+ input_video_embeds=vfeats,
423
+ attention_mask=attention_mask,
424
+ token_type_ids=token_type_ids,
425
+ output_hidden_states=True
426
+ )
427
+ video_outputs = outputs[0]
428
+
429
+ if output_hidden_states:
430
+ return video_outputs
431
+
432
+ batch_size = cmasks.size(0)
433
+
434
+ video_attention_mask = torch.cat(
435
+ [
436
+ torch.zeros(
437
+ (batch_size, 1), dtype=torch.bool, device=vmasks.device),
438
+ vmasks,
439
+ torch.ones(
440
+ (batch_size, 1), dtype=torch.bool, device=vmasks.device),
441
+ ],
442
+ dim=1,
443
+ )
444
+ assert video_outputs.size(1) == video_attention_mask.size(1)
445
+
446
+ video_attention_mask = video_attention_mask.type(video_outputs.dtype) \
447
+ / video_attention_mask.sum(1, keepdim=True)
448
+
449
+ pooled_video = torch.bmm(
450
+ video_outputs.transpose(2, 1),
451
+ video_attention_mask.unsqueeze(2)
452
+ ).squeeze(-1)
453
+ return pooled_video # video_outputs
454
+
455
+ def forward_text(
456
+ self,
457
+ caps,
458
+ cmasks,
459
+ output_hidden_states=False,
460
+ **kwargs
461
+ ):
462
+ input_ids = torch.cat([
463
+ caps[:, :1], caps[:, 2:],
464
+ ], dim=1)
465
+
466
+ attention_mask = torch.cat([
467
+ cmasks[:, :1],
468
+ cmasks[:, 2:]
469
+ ], dim=1)
470
+
471
+ token_type_ids = torch.cat([
472
+ torch.zeros(
473
+ (cmasks.size(0), 1),
474
+ dtype=torch.long,
475
+ device=cmasks.device),
476
+ torch.ones(
477
+ (cmasks.size(0), cmasks.size(1) - 2),
478
+ dtype=torch.long,
479
+ device=cmasks.device)
480
+ ], dim=1)
481
+
482
+ outputs = self.mm_encoder(
483
+ input_ids=input_ids,
484
+ input_video_embeds=None,
485
+ attention_mask=attention_mask,
486
+ token_type_ids=token_type_ids,
487
+ output_hidden_states=True
488
+ )
489
+ text_outputs = outputs[0]
490
+
491
+ if output_hidden_states:
492
+ return text_outputs
493
+
494
+ batch_size = caps.size(0)
495
+ # text tokens + [SEP]
496
+ text_attention_mask = torch.cat([
497
+ torch.zeros(
498
+ (batch_size, 1), dtype=torch.bool, device=cmasks.device),
499
+ cmasks[:, 2:]
500
+ ], dim=1)
501
+
502
+ assert text_outputs.size(1) == text_attention_mask.size(1)
503
+
504
+ text_attention_mask = text_attention_mask.type(text_outputs.dtype) \
505
+ / text_attention_mask.sum(1, keepdim=True)
506
+
507
+ pooled_text = torch.bmm(
508
+ text_outputs.transpose(2, 1),
509
+ text_attention_mask.unsqueeze(2)
510
+ ).squeeze(-1)
511
+ return pooled_text # text_outputs
512
+
513
+
514
+ class MMFusionSeparate(MMFusionShare):
515
+ def forward_video(
516
+ self,
517
+ vfeats,
518
+ vmasks,
519
+ caps,
520
+ cmasks,
521
+ output_hidden_states=False,
522
+ **kwargs
523
+ ):
524
+ input_ids = caps[:, :2]
525
+
526
+ attention_mask = torch.cat([
527
+ cmasks[:, :1],
528
+ vmasks,
529
+ cmasks[:, 1:2]
530
+ ], dim=1)
531
+
532
+ token_type_ids = torch.zeros(
533
+ (vmasks.size(0), vmasks.size(1) + 2),
534
+ dtype=torch.long,
535
+ device=vmasks.device)
536
+
537
+ outputs = self.video_encoder(
538
+ input_ids=input_ids,
539
+ input_video_embeds=vfeats,
540
+ attention_mask=attention_mask,
541
+ token_type_ids=token_type_ids,
542
+ output_hidden_states=True
543
+ )
544
+ video_outputs = outputs[0]
545
+
546
+ if output_hidden_states:
547
+ return video_outputs
548
+
549
+ batch_size = cmasks.size(0)
550
+
551
+ video_attention_mask = torch.cat(
552
+ [
553
+ torch.zeros(
554
+ (batch_size, 1), dtype=torch.bool, device=vmasks.device),
555
+ vmasks,
556
+ torch.ones(
557
+ (batch_size, 1), dtype=torch.bool, device=vmasks.device),
558
+ ],
559
+ dim=1,
560
+ )
561
+ assert video_outputs.size(1) == video_attention_mask.size(1)
562
+
563
+ video_attention_mask = video_attention_mask.type(video_outputs.dtype) \
564
+ / video_attention_mask.sum(1, keepdim=True)
565
+
566
+ pooled_video = torch.bmm(
567
+ video_outputs.transpose(2, 1),
568
+ video_attention_mask.unsqueeze(2)
569
+ ).squeeze(-1)
570
+ return pooled_video # video_outputs
571
+
572
+ def forward_text(
573
+ self,
574
+ caps,
575
+ cmasks,
576
+ output_hidden_states=False,
577
+ **kwargs
578
+ ):
579
+ input_ids = torch.cat([
580
+ caps[:, :1], caps[:, 2:],
581
+ ], dim=1)
582
+
583
+ attention_mask = torch.cat([
584
+ cmasks[:, :1],
585
+ cmasks[:, 2:]
586
+ ], dim=1)
587
+ # unlike the shared encoder, we use all-zero token type ids.
588
+ token_type_ids = torch.zeros(
589
+ (cmasks.size(0), cmasks.size(1) - 1),
590
+ dtype=torch.long,
591
+ device=cmasks.device)
592
+
593
+ outputs = self.text_encoder(
594
+ input_ids=input_ids,
595
+ attention_mask=attention_mask,
596
+ token_type_ids=token_type_ids,
597
+ output_hidden_states=True
598
+ )
599
+ text_outputs = outputs[0]
600
+
601
+ if output_hidden_states:
602
+ return text_outputs
603
+
604
+ batch_size = caps.size(0)
605
+ # text tokens + [SEP]
606
+ text_attention_mask = torch.cat([
607
+ torch.zeros(
608
+ (batch_size, 1), dtype=torch.bool, device=cmasks.device),
609
+ cmasks[:, 2:]
610
+ ], dim=1)
611
+
612
+ assert text_outputs.size(1) == text_attention_mask.size(1)
613
+
614
+ text_attention_mask = text_attention_mask.type(text_outputs.dtype) \
615
+ / text_attention_mask.sum(1, keepdim=True)
616
+
617
+ pooled_text = torch.bmm(
618
+ text_outputs.transpose(2, 1),
619
+ text_attention_mask.unsqueeze(2)
620
+ ).squeeze(-1)
621
+ return pooled_text # text_outputs
622
+
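With separate encoders, each clip and each caption collapses to a single pooled vector, so retrieval reduces to a similarity matrix between the two batches. A hedged sketch of the scoring step only; whether scores are normalized is decided by the training criterion, so the cosine here is purely illustrative:

import torch
import torch.nn.functional as F

# pooled_video, pooled_text: (batch, hidden_size), e.g. from forward_video / forward_text
pooled_video = torch.randn(4, 768)
pooled_text = torch.randn(4, 768)

video_n = F.normalize(pooled_video, dim=-1)
text_n = F.normalize(pooled_text, dim=-1)
scores = video_n @ text_n.t()              # (num_videos, num_texts)
best_text_per_video = scores.argmax(dim=1) # text-to-video retrieval would argmax dim=0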
623
+
624
+ class MMFusionJoint(MMFusion):
625
+ """fine-tuning wrapper for retrival task."""
626
+
627
+ def forward(
628
+ self,
629
+ caps,
630
+ cmasks,
631
+ vfeats,
632
+ vmasks,
633
+ attention_mask=None,
634
+ video_label=None,
635
+ text_label=None,
636
+ **kwargs
637
+ ):
638
+ # TODO (huxu): other ways to do negative examples; move the following
639
+ # into your criterion forward.
640
+ output_hidden_states = True
641
+
642
+ attention_mask, token_type_ids = self._mm_on_the_fly(
643
+ cmasks, vmasks, attention_mask)
644
+
645
+ separate_forward_split = (
646
+ None if self.is_train else vmasks.size(1) + 2
647
+ ) # [CLS] + [SEP]
648
+
649
+ outputs = self.mm_encoder(
650
+ input_ids=caps,
651
+ input_video_embeds=vfeats,
652
+ attention_mask=attention_mask,
653
+ token_type_ids=token_type_ids,
654
+ output_hidden_states=output_hidden_states,
655
+ separate_forward_split=separate_forward_split,
656
+ )
657
+
658
+ pooled_video, pooled_text = self._pooling_vt_layer(
659
+ outputs[2], cmasks, vmasks)
660
+ return {"pooled_video": pooled_video, "pooled_text": pooled_text}
661
+
662
+
663
+ class MMFusionActionSegmentation(MMFusion):
664
+ """Fine-tuning wrapper for action segmentation.
665
+ TODO: rename this for VLM.
666
+ """
667
+ def forward(
668
+ self,
669
+ caps,
670
+ cmasks,
671
+ vfeats,
672
+ vmasks,
673
+ attention_mask=None,
674
+ **kwargs
675
+ ):
676
+ # Like ActionLocalization, this assumes batch_size=1; flatten the leading dims.
677
+ caps = caps.view(-1, caps.size(-1))
678
+ cmasks = cmasks.view(-1, cmasks.size(-1))
679
+ vfeats = vfeats.view(-1, vfeats.size(2), vfeats.size(3))
680
+ vmasks = vmasks.view(-1, vmasks.size(-1))
681
+
682
+ # this may not cover all shapes of attention_mask.
683
+ attention_mask = attention_mask.view(
684
+ -1, attention_mask.size(2), attention_mask.size(3)) \
685
+ if attention_mask is not None else None
686
+
687
+ # TODO (huxu): other ways to do negative examples; move the following
688
+ # into your criterion forward.
689
+ output_hidden_states = True
690
+
691
+ # video forwarding, text is dummy; never use attention_mask.
692
+ attention_mask, token_type_ids = self._mm_on_the_fly(
693
+ cmasks, vmasks, attention_mask)
694
+
695
+ logits = self.mm_encoder(
696
+ input_ids=caps,
697
+ input_video_embeds=vfeats,
698
+ attention_mask=attention_mask,
699
+ token_type_ids=token_type_ids,
700
+ output_hidden_states=output_hidden_states,
701
+ )
702
+ return {"logits": logits[0][:, 1:vmasks.size(1)+1]}
703
+
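The wrapper returns one logit vector per video token; the slice 1:vmasks.size(1)+1 drops the leading [CLS] position so the logits align with frames. A small sketch of consuming that output, assuming (as is typical for this head) that the last dimension indexes action classes:

import torch

logits = torch.randn(2, 32, 48)            # (batch, num_video_tokens, num_classes), illustrative
vmasks = torch.ones(2, 32, dtype=torch.bool)

frame_preds = logits.argmax(dim=-1)        # (batch, num_video_tokens)
valid_preds = frame_preds[vmasks]          # keep only non-padded frames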
704
+
705
+ class MMFusionActionLocalization(MMFusion):
706
+ """fine-tuning model for retrival task."""
707
+
708
+ def __init__(self, config, **kwargs):
709
+ super().__init__(config)
710
+ tokenizer = AutoTokenizer.from_pretrained(
711
+ config.dataset.bert_name)
712
+ self.cls_token_id = tokenizer.cls_token_id
713
+ self.sep_token_id = tokenizer.sep_token_id
714
+ self.pad_token_id = tokenizer.pad_token_id
715
+
716
+ def forward(
717
+ self,
718
+ caps,
719
+ cmasks,
720
+ vfeats,
721
+ vmasks,
722
+ attention_mask=None,
723
+ **kwargs
724
+ ):
725
+ # ActionLocalization assumes batch_size=1; squeeze it.
726
+ caps = caps.squeeze(0)
727
+ cmasks = cmasks.squeeze(0)
728
+ vfeats = vfeats.squeeze(0)
729
+ vmasks = vmasks.squeeze(0)
730
+ attention_mask = attention_mask.squeeze(0) if attention_mask is not None else None
731
+
732
+ # TODO (huxu): other ways to do negative examples; move the following
733
+ # into your criterion forward.
734
+ output_hidden_states = True
735
+
736
+ # a len1 dummy video token.
737
+ dummy_vfeats = torch.zeros(
738
+ (caps.size(0), 1, vfeats.size(-1)), device=vfeats.device, dtype=vfeats.dtype)
739
+ dummy_vmasks = torch.ones(
740
+ (caps.size(0), 1), dtype=torch.bool,
741
+ device=vfeats.device)
742
+
743
+ dummy_caps = torch.LongTensor(
744
+ [[self.cls_token_id, self.sep_token_id,
745
+ self.pad_token_id, self.sep_token_id]],
746
+ ).to(caps.device).repeat(vfeats.size(0), 1)
747
+ dummy_cmasks = torch.BoolTensor(
748
+ [[0, 1, 0, 1]] # pad are valid for attention.
749
+ ).to(caps.device).repeat(vfeats.size(0), 1)
750
+
751
+ # video forwarding, text is dummy; never use attention_mask.
752
+ attention_mask, token_type_ids = self._mm_on_the_fly(
753
+ dummy_cmasks, vmasks, None)
754
+
755
+ outputs = self.mm_encoder(
756
+ input_ids=dummy_caps,
757
+ input_video_embeds=vfeats,
758
+ attention_mask=attention_mask,
759
+ token_type_ids=token_type_ids,
760
+ output_hidden_states=output_hidden_states,
761
+ )
762
+
763
+ layer_idx = self.last_iso_layer \
764
+ if self.last_iso_layer > 0 else self.num_hidden_layers
765
+
766
+ video_seq = outputs[2][layer_idx][:, 1:vmasks.size(1)+1].masked_select(
767
+ vmasks.unsqueeze(-1)
768
+ ).view(-1, self.hidden_size)
769
+
770
+ # text forwarding, video is dummy
771
+ attention_mask, token_type_ids = self._mm_on_the_fly(
772
+ cmasks, dummy_vmasks, None)
773
+
774
+ outputs = self.mm_encoder(
775
+ input_ids=caps,
776
+ input_video_embeds=dummy_vfeats,
777
+ attention_mask=attention_mask,
778
+ token_type_ids=token_type_ids,
779
+ output_hidden_states=output_hidden_states,
780
+ )
781
+
782
+ _, pooled_text = self._pooling_vt_layer(
783
+ outputs[2], cmasks, dummy_vmasks)
784
+ # this line is not right.
785
+ logits = torch.mm(video_seq, pooled_text.transpose(1, 0))
786
+ return {"logits": logits}
787
+
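Here torch.mm scores every valid video token against every pooled caption, so the returned logits form a (num_video_tokens, num_texts) grid; localization then picks, per frame, the candidate text with the highest score. A small sketch with illustrative shapes:

import torch

video_seq = torch.randn(120, 768)          # valid video tokens after masked_select
pooled_text = torch.randn(10, 768)         # one pooled vector per candidate action text

logits = torch.mm(video_seq, pooled_text.transpose(1, 0))  # (120, 10)
per_frame_action = logits.argmax(dim=1)                    # best-matching text per frame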
788
+
789
+ # --------------- MMFusionSeparate for end tasks ---------------
790
+
791
+ class MMFusionSeparateActionSegmentation(MMFusionSeparate):
792
+ """Fine-tuning wrapper for action segmentation."""
793
+ def forward(
794
+ self,
795
+ caps,
796
+ cmasks,
797
+ vfeats,
798
+ vmasks,
799
+ attention_mask=None,
800
+ **kwargs
801
+ ):
802
+ # Like ActionLocalization, this assumes batch_size=1; flatten the leading dims.
803
+ caps = caps.view(-1, caps.size(-1))
804
+ cmasks = cmasks.view(-1, cmasks.size(-1))
805
+ vfeats = vfeats.view(-1, vfeats.size(2), vfeats.size(3))
806
+ vmasks = vmasks.view(-1, vmasks.size(-1))
807
+ logits = self.forward_video(
808
+ vfeats,
809
+ vmasks,
810
+ caps,
811
+ cmasks,
812
+ output_hidden_states=True
813
+ )
814
+ return {"logits": logits[:, 1:vmasks.size(1)+1]}
815
+
816
+
817
+ class MMFusionSeparateActionLocalization(MMFusionSeparate):
818
+ def __init__(self, config, **kwargs):
819
+ super().__init__(config)
820
+ tokenizer = AutoTokenizer.from_pretrained(
821
+ config.dataset.bert_name)
822
+ self.cls_token_id = tokenizer.cls_token_id
823
+ self.sep_token_id = tokenizer.sep_token_id
824
+ self.pad_token_id = tokenizer.pad_token_id
825
+
826
+ def forward(
827
+ self,
828
+ caps,
829
+ cmasks,
830
+ vfeats,
831
+ vmasks,
832
+ **kwargs
833
+ ):
834
+ # ActionLocalization assumes batch_size=1; squeeze it.
835
+ caps = caps.squeeze(0)
836
+ cmasks = cmasks.squeeze(0)
837
+ vfeats = vfeats.squeeze(0)
838
+ vmasks = vmasks.squeeze(0)
839
+
840
+ # TODO (huxu): other ways to do negative examples; move the following
841
+ # into your criterion forward.
842
+ dummy_caps = torch.LongTensor(
843
+ [[self.cls_token_id, self.sep_token_id,
844
+ self.pad_token_id, self.sep_token_id]],
845
+ ).to(caps.device).repeat(vfeats.size(0), 1)
846
+ dummy_cmasks = torch.BoolTensor(
847
+ [[0, 1, 0, 1]] # pad are valid for attention.
848
+ ).to(caps.device).repeat(vfeats.size(0), 1)
849
+
850
+ outputs = self.forward_video(
851
+ vfeats,
852
+ vmasks,
853
+ dummy_caps,
854
+ dummy_cmasks,
855
+ output_hidden_states=True
856
+ )
857
+
858
+ video_seq = outputs[:, 1:vmasks.size(1)+1].masked_select(
859
+ vmasks.unsqueeze(-1)
860
+ ).view(-1, self.hidden_size)
861
+
862
+ pooled_text = self.forward_text(
863
+ caps,
864
+ cmasks,
865
+ output_hidden_states=False
866
+ )
867
+
868
+ # this line is not right.
869
+ logits = torch.mm(video_seq, pooled_text.transpose(1, 0))
870
+ return {"logits": logits}
871
+
872
+
873
+ class MMFusionShareActionLocalization(MMFusionShare):
874
+ def __init__(self, config, **kwargs):
875
+ super().__init__(config)
876
+ tokenizer = AutoTokenizer.from_pretrained(
877
+ config.dataset.bert_name)
878
+ self.cls_token_id = tokenizer.cls_token_id
879
+ self.sep_token_id = tokenizer.sep_token_id
880
+ self.pad_token_id = tokenizer.pad_token_id
881
+
882
+ def forward(
883
+ self,
884
+ caps,
885
+ cmasks,
886
+ vfeats,
887
+ vmasks,
888
+ **kwargs
889
+ ):
890
+ # ActionLocalization assumes batch_size=1; squeeze it.
891
+ caps = caps.squeeze(0)
892
+ cmasks = cmasks.squeeze(0)
893
+ vfeats = vfeats.squeeze(0)
894
+ vmasks = vmasks.squeeze(0)
895
+
896
+ # TODO (huxu): other ways to do negative examples; move the following
897
+ # into your criterion forward.
898
+ dummy_caps = torch.LongTensor(
899
+ [[self.cls_token_id, self.sep_token_id,
900
+ self.pad_token_id, self.sep_token_id]],
901
+ ).to(caps.device).repeat(vfeats.size(0), 1)
902
+ dummy_cmasks = torch.BoolTensor(
903
+ [[0, 1, 0, 1]] # pad are valid for attention.
904
+ ).to(caps.device).repeat(vfeats.size(0), 1)
905
+
906
+ outputs = self.forward_video(
907
+ vfeats,
908
+ vmasks,
909
+ dummy_caps,
910
+ dummy_cmasks,
911
+ output_hidden_states=True
912
+ )
913
+
914
+ video_seq = outputs[:, 1:vmasks.size(1)+1].masked_select(
915
+ vmasks.unsqueeze(-1)
916
+ ).view(-1, self.hidden_size)
917
+
918
+ pooled_text = self.forward_text(
919
+ caps,
920
+ cmasks,
921
+ output_hidden_states=False
922
+ )
923
+
924
+ # this line is not right.
925
+ logits = torch.mm(video_seq, pooled_text.transpose(1, 0))
926
+ return {"logits": logits}
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/mmfusionnlg.py ADDED
@@ -0,0 +1,999 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Copyright (c) Facebook, Inc. All Rights Reserved
17
+
18
+
19
+ import torch
20
+
21
+ from torch.nn import functional as F
22
+
23
+ from typing import Optional, Iterable
24
+
25
+ try:
26
+ from transformers import BertPreTrainedModel
27
+ from transformers.modeling_bert import BertOnlyMLMHead
28
+
29
+ from transformers.file_utils import ModelOutput
30
+ from transformers.modeling_outputs import CausalLMOutput
31
+ from transformers.generation_utils import (
32
+ BeamHypotheses,
33
+ top_k_top_p_filtering
34
+ )
35
+ except ImportError:
36
+ pass
37
+
38
+ from .mmfusion import MMFusion
39
+ from .transformermodel import MMBertModel
40
+ from ..modules import VideoTokenMLP
41
+
42
+
43
+ class MMFusionNLG(MMFusion):
44
+ def __init__(self, config, **kwargs):
45
+ super().__init__(config)
46
+ if config.model.max_decode_length is not None:
47
+ self.max_length = min(
48
+ config.model.max_decode_length,
49
+ config.dataset.max_len - config.dataset.max_video_len - 3
50
+ )
51
+ else:
52
+ self.max_length = \
53
+ config.dataset.max_len - config.dataset.max_video_len - 3
54
+ self.gen_param = config.gen_param if config.gen_param is not None \
55
+ else {}
56
+
57
+ def forward(
58
+ self,
59
+ caps,
60
+ cmasks,
61
+ vfeats,
62
+ vmasks,
63
+ attention_mask,
64
+ video_label=None,
65
+ text_label=None,
66
+ **kwargs
67
+ ):
68
+ """use pre-trained LM header for generation."""
69
+ attention_mask, token_type_ids = self._mm_on_the_fly(
70
+ cmasks, vmasks, attention_mask)
71
+
72
+ outputs = self.mm_encoder(
73
+ input_ids=caps,
74
+ input_video_embeds=vfeats,
75
+ attention_mask=attention_mask,
76
+ token_type_ids=token_type_ids,
77
+ masked_lm_labels=text_label,
78
+ )
79
+ return {"logits": outputs[0]}
80
+
81
+ @torch.no_grad()
82
+ def generate(
83
+ self,
84
+ caps, cmasks, vfeats, vmasks,
85
+ attention_mask=None,
86
+ bos_token_id=None,
87
+ eos_token_id=None,
88
+ **kwargs
89
+ ):
90
+ # a simplified interface from
91
+ # https://huggingface.co/transformers/v3.4.0/_modules/transformers/generation_utils.html#GenerationMixin.generate
92
+
93
+ # caps now only contains
94
+ # [CLS], [SEP] (for video) and [CLS] (as bos_token)
95
+ assert caps.size(1) == 3
96
+
97
+ attention_mask, token_type_ids = self._mm_on_the_fly(
98
+ cmasks, vmasks, attention_mask)
99
+
100
+ output = self.mm_encoder.generate(
101
+ input_ids=caps,
102
+ input_video_embeds=vfeats,
103
+ attention_mask=attention_mask,
104
+ token_type_ids=token_type_ids,
105
+ bos_token_id=bos_token_id,
106
+ eos_token_id=eos_token_id,
107
+ max_length=self.max_length,
108
+ **self.gen_param
109
+ )
110
+ return output
111
+
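Because the video embeddings occupy the front of the sequence, the text prompt handed to generate is just three special tokens: [CLS], the [SEP] that closes the video span, and a second [CLS] acting as the decoding BOS. A hedged sketch of building such a prompt with a BERT tokenizer; the model construction, the video features, and the exact bos/eos choices come from the task config and are assumed here:

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch_size = 1

caps = torch.LongTensor(
    [[tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.cls_token_id]]
).repeat(batch_size, 1)                    # shape (batch, 3), matching the assert above
cmasks = torch.ones(batch_size, 3, dtype=torch.bool)

# generated = model.generate(caps, cmasks, vfeats, vmasks,
#                            bos_token_id=tokenizer.cls_token_id,
#                            eos_token_id=tokenizer.sep_token_id)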
112
+
113
+ class MMBertForNLG(BertPreTrainedModel):
114
+ def __init__(self, config):
115
+ super().__init__(config)
116
+ self.bert = MMBertModel(config)
117
+ self.videomlp = VideoTokenMLP(config)
118
+ # we do not use `BertGenerationOnlyLMHead`
119
+ # because we can reuse pretraining.
120
+ self.cls = BertOnlyMLMHead(config)
121
+ self.hidden_size = config.hidden_size
122
+ self.init_weights()
123
+
124
+ def get_output_embeddings(self):
125
+ return self.cls.predictions.decoder
126
+
127
+ def forward(
128
+ self,
129
+ input_ids=None,
130
+ input_video_embeds=None,
131
+ attention_mask=None,
132
+ token_type_ids=None,
133
+ position_ids=None,
134
+ head_mask=None,
135
+ inputs_embeds=None,
136
+ masked_lm_labels=None,
137
+ output_attentions=None,
138
+ output_hidden_states=None,
139
+ return_dict=None,
140
+ ):
141
+ # similar to MMBertForMFMMLM without MFM.
142
+ video_tokens = self.videomlp(input_video_embeds)
143
+ outputs = self.bert(
144
+ input_ids,
145
+ video_tokens,
146
+ attention_mask=attention_mask,
147
+ token_type_ids=token_type_ids,
148
+ position_ids=position_ids,
149
+ head_mask=head_mask,
150
+ inputs_embeds=inputs_embeds,
151
+ output_attentions=output_attentions,
152
+ output_hidden_states=output_hidden_states,
153
+ return_dict=return_dict,
154
+ )
155
+
156
+ sequence_output = outputs[0]
157
+
158
+ prediction_scores = None
159
+ if masked_lm_labels is not None:
160
+ text_offset = input_video_embeds.size(1) + 1 # [CLS]
161
+ # recover caps format: [CLS] [SEP] text [SEP]
162
+ text_sequence_output = torch.cat(
163
+ [sequence_output[:, :1], sequence_output[:, text_offset:]],
164
+ dim=1
165
+ )
166
+
167
+ # only compute the selected (labeled) tokens during training to speed up.
168
+ hidden_size = text_sequence_output.size(-1)
169
+ # masked_lm_labels = masked_lm_labels.reshape(-1)
170
+ labels_mask = masked_lm_labels != -100
171
+
172
+ selected_text_output = text_sequence_output.masked_select(
173
+ labels_mask.unsqueeze(-1)
174
+ ).view(-1, hidden_size)
175
+ prediction_scores = self.cls(selected_text_output)
176
+
177
+ if not return_dict:
178
+ output = (
179
+ prediction_scores,
180
+ ) + outputs[2:]
181
+ return output
182
+
183
+ # for generation.
184
+ text_offset = input_video_embeds.size(1) + 2 # [CLS]
185
+ text_sequence_output = sequence_output[:, text_offset:]
186
+ prediction_scores = self.cls(text_sequence_output)
187
+ return CausalLMOutput(
188
+ loss=None,
189
+ logits=prediction_scores,
190
+ )
191
+
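During training the LM head is evaluated only at positions whose label is not -100, which keeps the full-vocabulary projection off the untouched tokens. A minimal sketch of that selection pattern outside the model, with illustrative shapes:

import torch

hidden = torch.randn(2, 10, 768)           # text-side hidden states
labels = torch.full((2, 10), -100)
labels[0, 3] = 1012                        # pretend two positions carry targets
labels[1, 7] = 2051

labels_mask = labels != -100               # (2, 10) bool
selected = hidden.masked_select(labels_mask.unsqueeze(-1)).view(-1, 768)
targets = labels.masked_select(labels_mask)

# loss = torch.nn.functional.cross_entropy(lm_head(selected), targets)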
192
+ def prepare_inputs_for_generation(
193
+ self,
194
+ input_ids,
195
+ input_video_embeds,
196
+ attention_mask=None,
197
+ token_type_ids=None,
198
+ **model_kwargs
199
+ ):
200
+ # must return a dictionary.
201
+ seq_len = input_ids.size(1) + input_video_embeds.size(1)
202
+ if attention_mask is not None:
203
+ if len(attention_mask.size()) == 4:
204
+ attention_mask = attention_mask[:, :, :seq_len, :seq_len]
205
+ elif len(attention_mask.size()) == 3:
206
+ attention_mask = attention_mask[:, :seq_len, :seq_len]
207
+ else:
208
+ attention_mask = attention_mask[:, :seq_len]
209
+ if token_type_ids is not None:
210
+ token_type_ids = token_type_ids[:, :seq_len]
211
+
212
+ return {
213
+ "input_ids": input_ids,
214
+ "input_video_embeds": input_video_embeds,
215
+ "attention_mask": attention_mask,
216
+ "token_type_ids": token_type_ids,
217
+ }
218
+
219
+ @torch.no_grad()
220
+ def generate(
221
+ self,
222
+ input_ids: Optional[torch.LongTensor] = None,
223
+ decoder_input_ids: Optional[torch.LongTensor] = None,
224
+ max_length: Optional[int] = None,
225
+ min_length: Optional[int] = None,
226
+ do_sample: Optional[bool] = None,
227
+ early_stopping: Optional[bool] = None,
228
+ num_beams: Optional[int] = None,
229
+ temperature: Optional[float] = None,
230
+ top_k: Optional[int] = None,
231
+ top_p: Optional[float] = None,
232
+ repetition_penalty: Optional[float] = None,
233
+ bad_words_ids: Optional[Iterable[int]] = None,
234
+ bos_token_id: Optional[int] = None,
235
+ pad_token_id: Optional[int] = None,
236
+ eos_token_id: Optional[int] = None,
237
+ length_penalty: Optional[float] = None,
238
+ no_repeat_ngram_size: Optional[int] = None,
239
+ num_return_sequences: Optional[int] = None,
240
+ attention_mask: Optional[torch.LongTensor] = None,
241
+ decoder_start_token_id: Optional[int] = None,
242
+ use_cache: Optional[bool] = None,
243
+ **model_kwargs
244
+ ) -> torch.LongTensor:
245
+ r"""
246
+ Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
247
+ beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.
248
+ Adapted in part from `Facebook's XLM beam search code
249
+ <https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__.
250
+ Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the
251
+ attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values
252
+ indicated are the default values of those config.
253
+ Most of these parameters are explained in more detail in `this blog post
254
+ <https://huggingface.co/blog/how-to-generate>`__.
255
+ Parameters:
256
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
257
+ The sequence used as a prompt for the generation. If :obj:`None` the method initializes
258
+ it as an empty :obj:`torch.LongTensor` of shape :obj:`(1,)`.
259
+ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
260
+ initial input_ids for the decoder of encoder-decoder type models. If :obj:`None` then only
261
+ decoder_start_token_id is passed as the first token to the decoder.
262
+ max_length (:obj:`int`, `optional`, defaults to 20):
263
+ The maximum length of the sequence to be generated.
264
+ min_length (:obj:`int`, `optional`, defaults to 10):
265
+ The minimum length of the sequence to be generated.
266
+ do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
267
+ Whether or not to use sampling ; use greedy decoding otherwise.
268
+ early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
269
+ Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
270
+ num_beams (:obj:`int`, `optional`, defaults to 1):
271
+ Number of beams for beam search. 1 means no beam search.
272
+ temperature (:obj:`float`, `optional`, defaults to 1.0):
273
+ The value used to module the next token probabilities.
274
+ top_k (:obj:`int`, `optional`, defaults to 50):
275
+ The number of highest probability vocabulary tokens to keep for top-k-filtering.
276
+ top_p (:obj:`float`, `optional`, defaults to 1.0):
277
+ If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or
278
+ higher are kept for generation.
279
+ repetition_penalty (:obj:`float`, `optional`, defaults to 1.0):
280
+ The parameter for repetition penalty. 1.0 means no penalty. See `this paper
281
+ <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
282
+ pad_token_id (:obj:`int`, `optional`):
283
+ The id of the `padding` token.
284
+ bos_token_id (:obj:`int`, `optional`):
285
+ The id of the `beginning-of-sequence` token.
286
+ eos_token_id (:obj:`int`, `optional`):
287
+ The id of the `end-of-sequence` token.
288
+ length_penalty (:obj:`float`, `optional`, defaults to 1.0):
289
+ Exponential penalty to the length. 1.0 means no penalty.
290
+ Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
291
+ order to encourage the model to produce longer sequences.
292
+ no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
293
+ If set to int > 0, all ngrams of that size can only occur once.
294
+ bad_words_ids(:obj:`List[int]`, `optional`):
295
+ List of token ids that are not allowed to be generated. In order to get the tokens of the words that
296
+ should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
297
+ num_return_sequences(:obj:`int`, `optional`, defaults to 1):
298
+ The number of independently computed returned sequences for each element in the batch.
299
+ attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
300
+ Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for
301
+ tokens that are not masked, and 0 for masked tokens.
302
+ If not provided, will default to a tensor the same shape as :obj:`input_ids` that masks the pad token.
303
+ `What are attention masks? <../glossary.html#attention-mask>`__
304
+ decoder_start_token_id (:obj:`int`, `optional`):
305
+ If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
306
+ use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
307
+ Whether or not the model should use the past last key/values attentions (if applicable to the model) to
308
+ speed up decoding.
309
+ model_kwargs:
310
+ Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model.
311
+ Return:
312
+ :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`:
313
+ The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
314
+ shorter if all batches finished early due to the :obj:`eos_token_id`.
315
+ Examples::
316
+ tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
317
+ model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
318
+ outputs = model.generate(max_length=40) # do greedy decoding
319
+ print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
320
+ tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
321
+ model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
322
+ input_context = 'The dog'
323
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
324
+ outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
325
+ for i in range(3): # 3 output sequences were generated
326
+ print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
327
+ tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
328
+ model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
329
+ input_context = 'The dog'
330
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
331
+ outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # generate 3 candidates using sampling
332
+ for i in range(3): # 3 output sequences were generated
333
+ print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
334
+ tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
335
+ model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
336
+ input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
337
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
338
+ outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
339
+ print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
340
+ tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
341
+ model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
342
+ input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl
343
+ bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
344
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
345
+ outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated
346
+ """
347
+
348
+ # We cannot generate if the model does not have a LM head
349
+ if self.get_output_embeddings() is None:
350
+ raise AttributeError(
351
+ "You tried to generate sequences with a model that does not have a LM Head."
352
+ "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )"
353
+ )
354
+
355
+ max_length = max_length if max_length is not None else self.config.max_length
356
+ min_length = min_length if min_length is not None else self.config.min_length
357
+ do_sample = do_sample if do_sample is not None else self.config.do_sample
358
+ early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
359
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
360
+ num_beams = num_beams if num_beams is not None else self.config.num_beams
361
+ temperature = temperature if temperature is not None else self.config.temperature
362
+ top_k = top_k if top_k is not None else self.config.top_k
363
+ top_p = top_p if top_p is not None else self.config.top_p
364
+ repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
365
+ bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
366
+ pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
367
+ eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
368
+ length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
369
+ no_repeat_ngram_size = (
370
+ no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
371
+ )
372
+ bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
373
+ num_return_sequences = (
374
+ num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
375
+ )
376
+ decoder_start_token_id = (
377
+ decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id
378
+ )
379
+
380
+ if input_ids is not None:
381
+ batch_size = input_ids.shape[0] # overridden by the input batch_size
382
+ else:
383
+ batch_size = 1
384
+
385
+ assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
386
+ assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
387
+ assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
388
+ assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
389
+ assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
390
+ assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
391
+ assert temperature > 0, "`temperature` should be strictly positive."
392
+ assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
393
+ assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
394
+ assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
395
+ assert input_ids is not None or (
396
+ isinstance(bos_token_id, int) and bos_token_id >= 0
397
+ ), "If input_ids is not defined, `bos_token_id` should be a positive integer."
398
+ assert pad_token_id is None or (
399
+ isinstance(pad_token_id, int) and (pad_token_id >= 0)
400
+ ), "`pad_token_id` should be a positive integer."
401
+ assert (eos_token_id is None) or (
402
+ isinstance(eos_token_id, int) and (eos_token_id >= 0)
403
+ ), "`eos_token_id` should be a positive integer."
404
+ assert length_penalty > 0, "`length_penalty` should be strictly positive."
405
+ assert (
406
+ isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0
407
+ ), "`no_repeat_ngram_size` should be a positive integer."
408
+ assert (
409
+ isinstance(num_return_sequences, int) and num_return_sequences > 0
410
+ ), "`num_return_sequences` should be a strictly positive integer."
411
+ assert (
412
+ bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
413
+ ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
414
+
415
+ if input_ids is None:
416
+ assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
417
+ "you should either supply a context to complete as `input_ids` input "
418
+ "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
419
+ )
420
+ input_ids = torch.full(
421
+ (batch_size, 1),
422
+ bos_token_id,
423
+ dtype=torch.long,
424
+ device=next(self.parameters()).device,
425
+ )
426
+ else:
427
+ assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
428
+
429
+ # not allow to duplicate outputs when greedy decoding
430
+ if do_sample is False:
431
+ if num_beams == 1:
432
+ # no_beam_search greedy generation conditions
433
+ assert (
434
+ num_return_sequences == 1
435
+ ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
436
+
437
+ else:
438
+ # beam_search greedy generation conditions
439
+ assert (
440
+ num_beams >= num_return_sequences
441
+ ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
442
+
443
+ # create attention mask if necessary
444
+ # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140
445
+ if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids):
446
+ attention_mask = input_ids.ne(pad_token_id).long()
447
+ elif attention_mask is None:
448
+ attention_mask = input_ids.new_ones(input_ids.shape)
449
+
450
+ # set pad_token_id to eos_token_id if not set. Important that this is done after
451
+ # attention_mask is created
452
+ if pad_token_id is None and eos_token_id is not None:
453
+ print(
454
+ "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id)
455
+ )
456
+ pad_token_id = eos_token_id
457
+
458
+ # vocab size
459
+ if hasattr(self.config, "vocab_size"):
460
+ vocab_size = self.config.vocab_size
461
+ elif (
462
+ self.config.is_encoder_decoder
463
+ and hasattr(self.config, "decoder")
464
+ and hasattr(self.config.decoder, "vocab_size")
465
+ ):
466
+ vocab_size = self.config.decoder.vocab_size
467
+ else:
468
+ raise ValueError("either self.config.vocab_size or self.config.decoder.vocab_size needs to be defined")
469
+
470
+ # set effective batch size and effective batch multiplier according to do_sample
471
+ if do_sample:
472
+ effective_batch_size = batch_size * num_return_sequences
473
+ effective_batch_mult = num_return_sequences
474
+ else:
475
+ effective_batch_size = batch_size
476
+ effective_batch_mult = 1
477
+
478
+ if self.config.is_encoder_decoder:
479
+ if decoder_start_token_id is None:
480
+ # see if BOS token can be used for decoder_start_token_id
481
+ if bos_token_id is not None:
482
+ decoder_start_token_id = bos_token_id
483
+ elif (
484
+ hasattr(self.config, "decoder")
485
+ and hasattr(self.config.decoder, "bos_token_id")
486
+ and self.config.decoder.bos_token_id is not None
487
+ ):
488
+ decoder_start_token_id = self.config.decoder.bos_token_id
489
+ else:
490
+ raise ValueError(
491
+ "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
492
+ )
493
+
494
+ assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self)
495
+ assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder)
496
+
497
+ # get encoder and store encoder outputs
498
+ encoder = self.get_encoder()
499
+ encoder_outputs: ModelOutput = encoder(input_ids, attention_mask=attention_mask, return_dict=True)
500
+
501
+ # Expand input ids if num_beams > 1 or num_return_sequences > 1
502
+ if num_return_sequences > 1 or num_beams > 1:
503
+ # TODO: make this a call-back function.
504
+ # input_ids=caps,
505
+ # input_video_embeds=vfeats,
506
+ # attention_mask=attention_mask,
507
+ # token_type_ids=token_type_ids,
508
+ input_video_embeds = model_kwargs.pop("input_video_embeds", None)
509
+ token_type_ids = model_kwargs.pop("token_type_ids", None)
510
+
511
+ input_ids_len = input_ids.shape[-1]
512
+ input_ids = input_ids.unsqueeze(1).expand(
513
+ batch_size, effective_batch_mult * num_beams, input_ids_len)
514
+
515
+ input_video_embeds_len, input_video_embeds_hidden = input_video_embeds.size(1), input_video_embeds.size(2)
516
+ input_video_embeds = input_video_embeds.unsqueeze(1).expand(
517
+ batch_size, effective_batch_mult * num_beams, input_video_embeds_len, input_video_embeds_hidden)
518
+
519
+ attention_mask_from_len, attention_mask_to_len = attention_mask.size(1), attention_mask.size(2)
520
+ attention_mask = attention_mask.unsqueeze(1).expand(
521
+ batch_size, effective_batch_mult * num_beams, attention_mask_from_len, attention_mask_to_len
522
+ )
523
+
524
+ token_type_ids_len = token_type_ids.size(1)
525
+ token_type_ids = token_type_ids.unsqueeze(1).expand(
526
+ batch_size, effective_batch_mult * num_beams, token_type_ids_len
527
+ )
528
+
529
+ # contiguous ...
530
+ input_ids = input_ids.contiguous().view(
531
+ effective_batch_size * num_beams, input_ids_len
532
+ ) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
533
+
534
+ input_video_embeds = input_video_embeds.contiguous().view(
535
+ effective_batch_size * num_beams, input_video_embeds_len, input_video_embeds_hidden)
536
+
537
+ attention_mask = attention_mask.contiguous().view(
538
+ effective_batch_size * num_beams, attention_mask_from_len, attention_mask_to_len
539
+ ) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
540
+
541
+ token_type_ids = token_type_ids.contiguous().view(
542
+ effective_batch_size * num_beams, token_type_ids_len
543
+ )
544
+
545
+ model_kwargs["input_video_embeds"] = input_video_embeds
546
+ model_kwargs["token_type_ids"] = token_type_ids
547
+
548
+ if self.config.is_encoder_decoder:
549
+ device = next(self.parameters()).device
550
+ if decoder_input_ids is not None:
551
+ # give initial decoder input ids
552
+ input_ids = decoder_input_ids.repeat(effective_batch_size * num_beams, 1).to(device)
553
+ else:
554
+ # create empty decoder input_ids
555
+ input_ids = torch.full(
556
+ (effective_batch_size * num_beams, 1),
557
+ decoder_start_token_id,
558
+ dtype=torch.long,
559
+ device=device,
560
+ )
561
+ cur_len = input_ids.shape[-1]
562
+
563
+ assert (
564
+ batch_size == encoder_outputs.last_hidden_state.shape[0]
565
+ ), f"expected encoder_outputs.last_hidden_state to have 1st dimension bs={batch_size}, got {encoder_outputs.last_hidden_state.shape[0]} "
566
+
567
+ # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1)
568
+ expanded_batch_idxs = (
569
+ torch.arange(batch_size)
570
+ .view(-1, 1)
571
+ .repeat(1, num_beams * effective_batch_mult)
572
+ .view(-1)
573
+ .to(input_ids.device)
574
+ )
575
+
576
+ # expand encoder_outputs
577
+ encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select(
578
+ 0, expanded_batch_idxs
579
+ )
580
+
581
+ # save encoder_outputs in `model_kwargs`
582
+ model_kwargs["encoder_outputs"] = encoder_outputs
583
+
584
+ else:
585
+ cur_len = input_ids.shape[-1]
586
+
587
+ assert (
588
+ cur_len < max_length
589
+ ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`"
590
+
591
+ if num_beams > 1:
592
+ output = self._generate_beam_search(
593
+ input_ids,
594
+ cur_len=cur_len,
595
+ max_length=max_length,
596
+ min_length=min_length,
597
+ do_sample=do_sample,
598
+ early_stopping=early_stopping,
599
+ temperature=temperature,
600
+ top_k=top_k,
601
+ top_p=top_p,
602
+ repetition_penalty=repetition_penalty,
603
+ no_repeat_ngram_size=no_repeat_ngram_size,
604
+ bad_words_ids=bad_words_ids,
605
+ pad_token_id=pad_token_id,
606
+ eos_token_id=eos_token_id,
607
+ batch_size=effective_batch_size,
608
+ num_return_sequences=num_return_sequences,
609
+ length_penalty=length_penalty,
610
+ num_beams=num_beams,
611
+ vocab_size=vocab_size,
612
+ attention_mask=attention_mask,
613
+ use_cache=use_cache,
614
+ model_kwargs=model_kwargs,
615
+ )
616
+ else:
617
+ output = self._generate_no_beam_search(
618
+ input_ids,
619
+ cur_len=cur_len,
620
+ max_length=max_length,
621
+ min_length=min_length,
622
+ do_sample=do_sample,
623
+ temperature=temperature,
624
+ top_k=top_k,
625
+ top_p=top_p,
626
+ repetition_penalty=repetition_penalty,
627
+ no_repeat_ngram_size=no_repeat_ngram_size,
628
+ bad_words_ids=bad_words_ids,
629
+ pad_token_id=pad_token_id,
630
+ eos_token_id=eos_token_id,
631
+ batch_size=effective_batch_size,
632
+ attention_mask=attention_mask,
633
+ use_cache=use_cache,
634
+ model_kwargs=model_kwargs,
635
+ )
636
+
637
+ return output
638
+
639
+ def _generate_beam_search(
640
+ self,
641
+ input_ids,
642
+ cur_len,
643
+ max_length,
644
+ min_length,
645
+ do_sample,
646
+ early_stopping,
647
+ temperature,
648
+ top_k,
649
+ top_p,
650
+ repetition_penalty,
651
+ no_repeat_ngram_size,
652
+ bad_words_ids,
653
+ pad_token_id,
654
+ eos_token_id,
655
+ batch_size,
656
+ num_return_sequences,
657
+ length_penalty,
658
+ num_beams,
659
+ vocab_size,
660
+ attention_mask,
661
+ use_cache,
662
+ model_kwargs,
663
+ ):
664
+ """Generate sequences for each example with beam search."""
665
+
666
+ # generated hypotheses
667
+ generated_hyps = [
668
+ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping)
669
+ for _ in range(batch_size)
670
+ ]
671
+
672
+ # scores for each sentence in the beam
673
+ beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
674
+
675
+ # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times
676
+ if do_sample is False:
677
+ beam_scores[:, 1:] = -1e9
678
+ beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,)
679
+
680
+ # cache compute states
681
+ past = None
682
+
683
+ # done sentences
684
+ done = [False for _ in range(batch_size)]
685
+
686
+ while cur_len < max_length:
687
+ model_inputs = self.prepare_inputs_for_generation(
688
+ input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs
689
+ )
690
+ outputs = self(**model_inputs, return_dict=True) # (batch_size * num_beams, cur_len, vocab_size)
691
+ next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size)
692
+
693
+ # if model has past, then set the past variable to speed up decoding
694
+ if "past_key_values" in outputs:
695
+ past = outputs.past_key_values
696
+ elif "mems" in outputs:
697
+ past = outputs.mems
698
+
699
+ if self.config.is_encoder_decoder and do_sample is False:
700
+ # TODO (PVP) still a bit hacky here - there might be a better solution
701
+ next_token_logits = self.adjust_logits_during_generation(
702
+ next_token_logits, cur_len=cur_len, max_length=max_length
703
+ )
704
+
705
+ scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size)
706
+
707
+ scores = self.postprocess_next_token_scores(
708
+ scores=scores,
709
+ input_ids=input_ids,
710
+ no_repeat_ngram_size=no_repeat_ngram_size,
711
+ bad_words_ids=bad_words_ids,
712
+ cur_len=cur_len,
713
+ min_length=min_length,
714
+ max_length=max_length,
715
+ eos_token_id=eos_token_id,
716
+ repetition_penalty=repetition_penalty,
717
+ batch_size=batch_size,
718
+ num_beams=num_beams,
719
+ )
720
+
721
+ assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format(
722
+ scores.shape, (batch_size * num_beams, vocab_size)
723
+ )
724
+
725
+ if do_sample:
726
+ _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)
727
+ # Temperature
728
+ if temperature != 1.0:
729
+ _scores = _scores / temperature
730
+ # Top-p/top-k filtering
731
+ _scores = top_k_top_p_filtering(
732
+ _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
733
+ ) # (batch_size * num_beams, vocab_size)
734
+ # re-organize to group the beam together to sample from all beam_idxs
735
+ _scores = _scores.contiguous().view(
736
+ batch_size, num_beams * vocab_size
737
+ ) # (batch_size, num_beams * vocab_size)
738
+
739
+ # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search)
740
+ probs = F.softmax(_scores, dim=-1)
741
+ next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2)
742
+ # Compute next scores
743
+ next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2)
744
+ # sort the sampled vector to make sure that the first num_beams samples are the best
745
+ next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1)
746
+ next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2)
747
+
748
+ else:
749
+ next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)
750
+
751
+ # re-organize to group the beams together (we keep the top hypotheses across beams)
752
+ next_scores = next_scores.view(
753
+ batch_size, num_beams * vocab_size
754
+ ) # (batch_size, num_beams * vocab_size)
755
+
756
+ next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
757
+
758
+ assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams)
759
+
760
+ # next batch beam content
761
+ next_batch_beam = []
762
+
763
+ # for each sentence
764
+ for batch_idx in range(batch_size):
765
+
766
+ # if we are done with this sentence, add a pad token
767
+ if done[batch_idx]:
768
+ assert (
769
+ len(generated_hyps[batch_idx]) >= num_beams
770
+ ), "Batch can only be done if at least {} beams have been generated".format(num_beams)
771
+ assert (
772
+ eos_token_id is not None and pad_token_id is not None
773
+ ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined"
774
+ next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch
775
+ continue
776
+
777
+ # next sentence beam content, this will get added to next_batch_beam
778
+ next_sent_beam = []
779
+
780
+ # next tokens for this sentence
781
+ for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
782
+ zip(next_tokens[batch_idx], next_scores[batch_idx])
783
+ ):
784
+ # get beam and token IDs
785
+ beam_id = beam_token_id // vocab_size
786
+ token_id = beam_token_id % vocab_size
787
+
788
+ effective_beam_id = batch_idx * num_beams + beam_id
789
+ # add to generated hypotheses if end of sentence
790
+ if (eos_token_id is not None) and (token_id.item() == eos_token_id):
791
+ # if beam_token does not belong to top num_beams tokens, it should not be added
792
+ is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams
793
+ if is_beam_token_worse_than_top_num_beams:
794
+ continue
795
+ generated_hyps[batch_idx].add(
796
+ input_ids[effective_beam_id].clone(),
797
+ beam_token_score.item(),
798
+ )
799
+ else:
800
+ # add next predicted token since it is not eos_token
801
+ next_sent_beam.append((beam_token_score, token_id, effective_beam_id))
802
+
803
+ # once the beam for next step is full, don't add more tokens to it.
804
+ if len(next_sent_beam) == num_beams:
805
+ break
806
+
807
+ # Check if we are done so that we can save a pad step if all(done)
808
+ done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done(
809
+ next_scores[batch_idx].max().item(), cur_len
810
+ )
811
+
812
+ # update next beam content
813
+ assert len(next_sent_beam) == num_beams, "Beam should always be full"
814
+ next_batch_beam.extend(next_sent_beam)
815
+ assert len(next_batch_beam) == num_beams * (batch_idx + 1), "We should have added num_beams each step"
816
+
817
+ # stop when we are done with each sentence
818
+ if all(done):
819
+ break
820
+
821
+ # sanity check / prepare next batch
822
+ assert len(next_batch_beam) == batch_size * num_beams
823
+ beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
824
+ beam_tokens = input_ids.new([x[1] for x in next_batch_beam])
825
+ beam_idx = input_ids.new([x[2] for x in next_batch_beam])
826
+
827
+ # re-order batch and update current length
828
+ input_ids = input_ids[beam_idx, :]
829
+ input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1)
830
+ cur_len = cur_len + 1
831
+
832
+ # re-order internal states
833
+ if past is not None:
834
+ past = self._reorder_cache(past, beam_idx)
835
+
836
+ # extend attention_mask for new generated input if only decoder
837
+ # (huxu): move out since we trim attention_mask by ourselves.
838
+ # if self.config.is_encoder_decoder is False:
839
+ # attention_mask = torch.cat(
840
+ # [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
841
+ # )
842
+
843
+ # finalize all open beam hypotheses and add to generated hypotheses
844
+ for batch_idx in range(batch_size):
845
+ if done[batch_idx]:
846
+ continue
847
+
848
+ # test that beam scores match previously calculated scores if not eos and batch_idx not done
849
+ if eos_token_id is not None and all(
850
+ (token_id % vocab_size).item() != eos_token_id for token_id in next_tokens[batch_idx]
851
+ ):
852
+ assert torch.all(
853
+ next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx]
854
+ ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
855
+ next_scores[:, :num_beams][batch_idx],
856
+ beam_scores.view(batch_size, num_beams)[batch_idx],
857
+ )
858
+
859
+ # need to add best num_beams hypotheses to generated hyps
860
+ for beam_id in range(num_beams):
861
+ effective_beam_id = batch_idx * num_beams + beam_id
862
+ final_score = beam_scores[effective_beam_id].item()
863
+ final_tokens = input_ids[effective_beam_id]
864
+ generated_hyps[batch_idx].add(final_tokens, final_score)
865
+
866
+ # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
867
+ output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
868
+ output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
869
+
870
+ # select the best hypotheses
871
+ sent_lengths = input_ids.new(output_batch_size)
872
+ best = []
873
+
874
+ # retrieve best hypotheses
875
+ for i, hypotheses in enumerate(generated_hyps):
876
+ sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
877
+ for j in range(output_num_return_sequences_per_batch):
878
+ effective_batch_idx = output_num_return_sequences_per_batch * i + j
879
+ best_hyp = sorted_hyps.pop()[1]
880
+ sent_lengths[effective_batch_idx] = len(best_hyp)
881
+ best.append(best_hyp)
882
+
883
+ # prepare for adding eos
884
+ sent_max_len = min(sent_lengths.max().item() + 1, max_length)
885
+ decoded = input_ids.new(output_batch_size, sent_max_len)
886
+ # shorter batches are padded if needed
887
+ if sent_lengths.min().item() != sent_lengths.max().item():
888
+ assert pad_token_id is not None, "`pad_token_id` has to be defined"
889
+ decoded.fill_(pad_token_id)
890
+
891
+ # fill with hypotheses and eos_token_id if the latter fits in
892
+ for i, hypo in enumerate(best):
893
+ decoded[i, : sent_lengths[i]] = hypo
894
+ if sent_lengths[i] < max_length:
895
+ decoded[i, sent_lengths[i]] = eos_token_id
896
+
897
+ return decoded
898
+
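Inside the loop, candidates come from a topk over scores flattened to (batch, num_beams * vocab_size), so each index encodes both its beam and its token: the beam is the quotient by vocab_size and the token is the remainder. A tiny sketch of that decomposition:

import torch

num_beams, vocab_size = 3, 7
scores = torch.randn(1, num_beams * vocab_size)    # one batch element, beams flattened

next_scores, next_tokens = torch.topk(scores, 2 * num_beams, dim=1)
beam_ids = next_tokens // vocab_size               # which beam each candidate extends
token_ids = next_tokens % vocab_size               # which vocabulary item it adds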
899
+ def _generate_no_beam_search(
900
+ self,
901
+ input_ids,
902
+ cur_len,
903
+ max_length,
904
+ min_length,
905
+ do_sample,
906
+ temperature,
907
+ top_k,
908
+ top_p,
909
+ repetition_penalty,
910
+ no_repeat_ngram_size,
911
+ bad_words_ids,
912
+ pad_token_id,
913
+ eos_token_id,
914
+ batch_size,
915
+ attention_mask,
916
+ use_cache,
917
+ model_kwargs,
918
+ ):
919
+ """Generate sequences for each example without beam search (num_beams == 1).
920
+ All returned sequences are generated independently.
921
+ """
922
+ # length of generated sentences / unfinished sentences
923
+ unfinished_sents = input_ids.new(batch_size).fill_(1)
924
+ sent_lengths = input_ids.new(batch_size).fill_(max_length)
925
+
926
+ past = None
927
+ while cur_len < max_length:
928
+ model_inputs = self.prepare_inputs_for_generation(
929
+ input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs
930
+ )
931
+
932
+ outputs = self(**model_inputs, return_dict=True)
933
+ next_token_logits = outputs.logits[:, -1, :]
934
+ scores = self.postprocess_next_token_scores(
935
+ scores=next_token_logits,
936
+ input_ids=input_ids,
937
+ no_repeat_ngram_size=no_repeat_ngram_size,
938
+ bad_words_ids=bad_words_ids,
939
+ cur_len=cur_len,
940
+ min_length=min_length,
941
+ max_length=max_length,
942
+ eos_token_id=eos_token_id,
943
+ repetition_penalty=repetition_penalty,
944
+ batch_size=batch_size,
945
+ num_beams=1,
946
+ )
947
+
948
+ # if model has past, then set the past variable to speed up decoding
949
+ if "past_key_values" in outputs:
950
+ past = outputs.past_key_values
951
+ elif "mems" in outputs:
952
+ past = outputs.mems
953
+
954
+ if do_sample:
955
+ # Temperature (higher temperature => more likely to sample low probability tokens)
956
+ if temperature != 1.0:
957
+ scores = scores / temperature
958
+ # Top-p/top-k filtering
959
+ next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p)
960
+ # Sample
961
+ probs = F.softmax(next_token_logscores, dim=-1)
962
+ next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
963
+ else:
964
+ # Greedy decoding
965
+ next_token = torch.argmax(next_token_logits, dim=-1)
966
+
967
+ # print(next_token_logits[0,next_token[0]], next_token_logits[0,eos_token_id])
968
+
969
+ # update generations and finished sentences
970
+ if eos_token_id is not None:
971
+ # pad finished sentences if eos_token_id exist
972
+ tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents)
973
+ else:
974
+ tokens_to_add = next_token
975
+
976
+ # add token and increase length by one
977
+ input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)
978
+ cur_len = cur_len + 1
979
+
980
+ if eos_token_id is not None:
981
+ eos_in_sents = tokens_to_add == eos_token_id
982
+ # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length
983
+ is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool()
984
+ sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len)
985
+ # unfinished_sents is set to zero if eos in sentence
986
+ unfinished_sents.mul_((~eos_in_sents).long())
987
+
988
+ # stop when there is a </s> in each sentence, or if we exceed the maximum length
989
+ if unfinished_sents.max() == 0:
990
+ break
991
+
992
+
993
+ # extend attention_mask for new generated input if only decoder
994
+ # if self.config.is_encoder_decoder is False:
995
+ # attention_mask = torch.cat(
996
+ # [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
997
+ # )
998
+
999
+ return input_ids
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/models/transformermodel.py ADDED
@@ -0,0 +1,734 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Copyright (c) Facebook, Inc. All Rights Reserved
17
+
18
+ import torch
19
+
20
+ from torch import nn
21
+
22
+ try:
23
+ from transformers.modeling_bert import (
24
+ BertPreTrainedModel,
25
+ BertModel,
26
+ BertEncoder,
27
+ BertPredictionHeadTransform,
28
+ )
29
+ except ImportError:
30
+ pass
31
+
32
+ from ..modules import VideoTokenMLP, MMBertEmbeddings
33
+
34
+
35
+ # --------------- fine-tuning models ---------------
36
+ class MMBertForJoint(BertPreTrainedModel):
37
+ """A BertModel with isolated attention mask to separate modality."""
38
+
39
+ def __init__(self, config):
40
+ super().__init__(config)
41
+ self.videomlp = VideoTokenMLP(config)
42
+ self.bert = MMBertModel(config)
43
+ self.init_weights()
44
+
45
+ def forward(
46
+ self,
47
+ input_ids=None,
48
+ input_video_embeds=None,
49
+ attention_mask=None,
50
+ token_type_ids=None,
51
+ position_ids=None,
52
+ head_mask=None,
53
+ inputs_embeds=None,
54
+ next_sentence_label=None,
55
+ output_attentions=None,
56
+ output_hidden_states=None,
57
+ return_dict=None,
58
+ separate_forward_split=None,
59
+ ):
60
+ return_dict = (
61
+ return_dict if return_dict is not None
62
+ else self.config.use_return_dict
63
+ )
64
+ video_tokens = self.videomlp(input_video_embeds)
65
+
66
+ outputs = self.bert(
67
+ input_ids,
68
+ video_tokens,
69
+ attention_mask=attention_mask,
70
+ token_type_ids=token_type_ids,
71
+ position_ids=position_ids,
72
+ head_mask=head_mask,
73
+ inputs_embeds=inputs_embeds,
74
+ output_attentions=output_attentions,
75
+ output_hidden_states=output_hidden_states,
76
+ return_dict=return_dict,
77
+ separate_forward_split=separate_forward_split,
78
+ )
79
+
80
+ return outputs
81
+
82
+
83
+ class MMBertForTokenClassification(BertPreTrainedModel):
84
+ """A BertModel similar to MMJointUni, with extra wrapper layer
85
+ to be fine-tuned from other pretrained MMFusion model."""
86
+
87
+ def __init__(self, config):
88
+ super().__init__(config)
89
+ self.videomlp = VideoTokenMLP(config)
90
+ self.bert = MMBertModel(config)
91
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
92
+ # TODO(huxu): 779 is the number of classes for COIN: move to config?
93
+ self.classifier = nn.Linear(config.hidden_size, 779)
94
+ self.init_weights()
95
+
96
+ def forward(
97
+ self,
98
+ input_ids=None,
99
+ input_video_embeds=None,
100
+ attention_mask=None,
101
+ token_type_ids=None,
102
+ position_ids=None,
103
+ head_mask=None,
104
+ inputs_embeds=None,
105
+ next_sentence_label=None,
106
+ output_attentions=None,
107
+ output_hidden_states=None,
108
+ return_dict=None,
109
+ separate_forward_split=None,
110
+ ):
111
+ return_dict = (
112
+ return_dict if return_dict is not None
113
+ else self.config.use_return_dict
114
+ )
115
+
116
+ video_tokens = self.videomlp(input_video_embeds)
117
+ outputs = self.bert(
118
+ input_ids,
119
+ video_tokens,
120
+ attention_mask=attention_mask,
121
+ token_type_ids=token_type_ids,
122
+ position_ids=position_ids,
123
+ head_mask=head_mask,
124
+ inputs_embeds=inputs_embeds,
125
+ output_attentions=output_attentions,
126
+ output_hidden_states=output_hidden_states,
127
+ return_dict=return_dict,
128
+ separate_forward_split=separate_forward_split,
129
+ )
130
+
131
+ return (self.classifier(outputs[0]),)
132
+
133
+
134
+ # ------------ pre-training models ----------------
135
+
136
+ class MMBertForEncoder(BertPreTrainedModel):
137
+ """A BertModel for Contrastive Learning."""
138
+ def __init__(self, config):
139
+ super().__init__(config)
140
+ self.videomlp = VideoTokenMLP(config)
141
+ self.bert = MMBertModel(config)
142
+ self.init_weights()
143
+
144
+ def forward(
145
+ self,
146
+ input_ids=None,
147
+ input_video_embeds=None,
148
+ attention_mask=None,
149
+ token_type_ids=None,
150
+ position_ids=None,
151
+ head_mask=None,
152
+ inputs_embeds=None,
153
+ output_attentions=None,
154
+ output_hidden_states=None,
155
+ return_dict=None,
156
+ ):
157
+ return_dict = (
158
+ return_dict if return_dict is not None
159
+ else self.config.use_return_dict
160
+ )
161
+ if input_video_embeds is not None:
162
+ video_tokens = self.videomlp(input_video_embeds)
163
+ else:
164
+ video_tokens = None
165
+
166
+ outputs = self.bert(
167
+ input_ids,
168
+ video_tokens,
169
+ attention_mask=attention_mask,
170
+ token_type_ids=token_type_ids,
171
+ position_ids=position_ids,
172
+ head_mask=head_mask,
173
+ inputs_embeds=inputs_embeds,
174
+ output_attentions=output_attentions,
175
+ output_hidden_states=output_hidden_states,
176
+ return_dict=return_dict,
177
+ )
178
+ return outputs
179
+
180
+
181
+ class MMBertForMFMMLM(BertPreTrainedModel):
182
+ """A BertModel with shared prediction head on MFM-MLM."""
183
+ def __init__(self, config):
184
+ super().__init__(config)
185
+ self.videomlp = VideoTokenMLP(config)
186
+ self.bert = MMBertModel(config)
187
+ self.cls = MFMMLMHead(config)
188
+ self.hidden_size = config.hidden_size
189
+ self.init_weights()
190
+
191
+ def get_output_embeddings(self):
192
+ return self.cls.predictions.decoder
193
+
194
+ def forward(
195
+ self,
196
+ input_ids=None,
197
+ input_video_embeds=None,
198
+ attention_mask=None,
199
+ token_type_ids=None,
200
+ position_ids=None,
201
+ head_mask=None,
202
+ inputs_embeds=None,
203
+ masked_frame_labels=None,
204
+ target_video_hidden_states=None,
205
+ non_masked_frame_mask=None,
206
+ masked_lm_labels=None,
207
+ output_attentions=None,
208
+ output_hidden_states=None,
209
+ return_dict=None,
210
+ ):
211
+ return_dict = (
212
+ return_dict if return_dict is not None
213
+ else self.config.use_return_dict
214
+ )
215
+ if input_video_embeds is not None:
216
+ video_tokens = self.videomlp(input_video_embeds)
217
+ else:
218
+ video_tokens = None
219
+
220
+ if target_video_hidden_states is not None:
221
+ target_video_hidden_states = self.videomlp(
222
+ target_video_hidden_states)
223
+
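+ # hidden states of the frames that were NOT masked; the prediction head
+ # scores each masked frame against these as additional comparison targets.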
224
+ non_masked_frame_hidden_states = video_tokens.masked_select(
225
+ non_masked_frame_mask.unsqueeze(-1)
226
+ ).view(-1, self.hidden_size)
227
+
228
+ outputs = self.bert(
229
+ input_ids,
230
+ video_tokens,
231
+ attention_mask=attention_mask,
232
+ token_type_ids=token_type_ids,
233
+ position_ids=position_ids,
234
+ head_mask=head_mask,
235
+ inputs_embeds=inputs_embeds,
236
+ output_attentions=output_attentions,
237
+ output_hidden_states=output_hidden_states,
238
+ return_dict=return_dict,
239
+ )
240
+
241
+ sequence_output = outputs[0]
242
+
243
+ mfm_scores, prediction_scores = None, None
244
+ if masked_frame_labels is not None and masked_lm_labels is not None:
245
+ # split the sequence.
246
+ text_offset = masked_frame_labels.size(1) + 1 # [CLS]
247
+ video_sequence_output = sequence_output[
248
+ :, 1:text_offset
249
+ ] # exclude [SEP], since it is not in video_label.
250
+ text_sequence_output = torch.cat(
251
+ [sequence_output[:, :1], sequence_output[:, text_offset:]],
252
+ dim=1
253
+ )
254
+
255
+ hidden_size = video_sequence_output.size(-1)
256
+ selected_video_output = video_sequence_output.masked_select(
257
+ masked_frame_labels.unsqueeze(-1)
258
+ ).view(-1, hidden_size)
259
+
260
+ # only compute the selected tokens during training, to speed things up.
261
+ hidden_size = text_sequence_output.size(-1)
262
+ # masked_lm_labels = masked_lm_labels.reshape(-1)
263
+ labels_mask = masked_lm_labels != -100
264
+
265
+ selected_text_output = text_sequence_output.masked_select(
266
+ labels_mask.unsqueeze(-1)
267
+ ).view(-1, hidden_size)
268
+ mfm_scores, prediction_scores = self.cls(
269
+ selected_video_output,
270
+ target_video_hidden_states,
271
+ non_masked_frame_hidden_states,
272
+ selected_text_output,
273
+ )
274
+
275
+ output = (
276
+ mfm_scores,
277
+ prediction_scores,
278
+ ) + outputs
279
+ return output
280
+
281
+
282
+ class BertMFMMLMPredictionHead(nn.Module):
283
+ def __init__(self, config):
284
+ super().__init__()
285
+ self.transform = BertPredictionHeadTransform(config)
286
+ # The output weights are the same as the input embeddings, but there is
287
+ # an output-only bias for each token.
288
+ self.decoder = nn.Linear(
289
+ config.hidden_size, config.vocab_size, bias=False)
290
+
291
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
292
+
293
+ # Need a link between the two variables so that the bias is correctly
294
+ # resized with `resize_token_embeddings`
295
+ self.decoder.bias = self.bias
296
+
297
+ def forward(
298
+ self,
299
+ video_hidden_states=None,
300
+ target_video_hidden_states=None,
301
+ non_masked_frame_hidden_states=None,
302
+ text_hidden_states=None,
303
+ ):
304
+ video_logits, text_logits = None, None
305
+ if video_hidden_states is not None:
306
+ video_hidden_states = self.transform(video_hidden_states)
307
+ non_masked_frame_logits = torch.mm(
308
+ video_hidden_states,
309
+ non_masked_frame_hidden_states.transpose(1, 0)
310
+ )
311
+ masked_frame_logits = torch.bmm(
312
+ video_hidden_states.unsqueeze(1),
313
+ target_video_hidden_states.unsqueeze(-1),
314
+ ).squeeze(-1)
315
+ video_logits = torch.cat(
316
+ [masked_frame_logits, non_masked_frame_logits], dim=1
317
+ )
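+ # column 0 holds each masked frame's score against its own target;
+ # the remaining columns are its scores against the unmasked frames.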
318
+
319
+ if text_hidden_states is not None:
320
+ text_hidden_states = self.transform(text_hidden_states)
321
+ text_logits = self.decoder(text_hidden_states)
322
+ return video_logits, text_logits
323
+
324
+
325
+ class MFMMLMHead(nn.Module):
326
+ def __init__(self, config):
327
+ super().__init__()
328
+ self.predictions = BertMFMMLMPredictionHead(config)
329
+
330
+ def forward(
331
+ self,
332
+ video_hidden_states=None,
333
+ target_video_hidden_states=None,
334
+ non_masked_frame_hidden_states=None,
335
+ text_hidden_states=None,
336
+ ):
337
+ video_logits, text_logits = self.predictions(
338
+ video_hidden_states,
339
+ target_video_hidden_states,
340
+ non_masked_frame_hidden_states,
341
+ text_hidden_states,
342
+ )
343
+ return video_logits, text_logits
344
+
345
+
346
+ class MMBertForMTM(MMBertForMFMMLM):
347
+ def __init__(self, config):
348
+ BertPreTrainedModel.__init__(self, config)
349
+ self.videomlp = VideoTokenMLP(config)
350
+ self.bert = MMBertModel(config)
351
+ self.cls = MTMHead(config)
352
+ self.hidden_size = config.hidden_size
353
+ self.init_weights()
354
+
355
+
356
+ class BertMTMPredictionHead(nn.Module):
357
+ def __init__(self, config):
358
+ super().__init__()
359
+ self.transform = BertPredictionHeadTransform(config)
360
+ self.decoder = nn.Linear(
361
+ config.hidden_size, config.vocab_size, bias=False)
362
+
363
+ def forward(
364
+ self,
365
+ video_hidden_states=None,
366
+ target_video_hidden_states=None,
367
+ non_masked_frame_hidden_states=None,
368
+ text_hidden_states=None,
369
+ ):
370
+ non_masked_frame_hidden_states = non_masked_frame_hidden_states.transpose(1, 0)
371
+ video_logits, text_logits = None, None
372
+ if video_hidden_states is not None:
373
+ video_hidden_states = self.transform(video_hidden_states)
374
+
375
+ masked_frame_logits = torch.bmm(
376
+ video_hidden_states.unsqueeze(1),
377
+ target_video_hidden_states.unsqueeze(-1),
378
+ ).squeeze(-1)
379
+
380
+ non_masked_frame_logits = torch.mm(
381
+ video_hidden_states,
382
+ non_masked_frame_hidden_states
383
+ )
384
+ video_on_vocab_logits = self.decoder(video_hidden_states)
385
+ video_logits = torch.cat([
386
+ masked_frame_logits,
387
+ non_masked_frame_logits,
388
+ video_on_vocab_logits], dim=1)
389
+
390
+ if text_hidden_states is not None:
391
+ text_hidden_states = self.transform(text_hidden_states)
392
+ # text first so label does not need to be shifted.
393
+ text_on_vocab_logits = self.decoder(text_hidden_states)
394
+ text_on_video_logits = torch.mm(
395
+ text_hidden_states,
396
+ non_masked_frame_hidden_states
397
+ )
398
+ text_logits = torch.cat([
399
+ text_on_vocab_logits,
400
+ text_on_video_logits
401
+ ], dim=1)
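+ # the text logits above cover the word vocabulary first, then the
+ # unmasked video frames, so a text position can be matched to either.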
402
+
403
+ return video_logits, text_logits
404
+
405
+
406
+ class MTMHead(nn.Module):
407
+ def __init__(self, config):
408
+ super().__init__()
409
+ self.predictions = BertMTMPredictionHead(config)
410
+
411
+ def forward(
412
+ self,
413
+ video_hidden_states=None,
414
+ target_video_hidden_states=None,
415
+ non_masked_frame_hidden_states=None,
416
+ text_hidden_states=None,
417
+ ):
418
+ video_logits, text_logits = self.predictions(
419
+ video_hidden_states,
420
+ target_video_hidden_states,
421
+ non_masked_frame_hidden_states,
422
+ text_hidden_states,
423
+ )
424
+ return video_logits, text_logits
425
+
426
+
427
+ class MMBertModel(BertModel):
428
+ """MMBertModel has MMBertEmbedding to support video tokens."""
429
+
430
+ def __init__(self, config, add_pooling_layer=True):
431
+ super().__init__(config)
432
+ # overwrite embedding
433
+ self.embeddings = MMBertEmbeddings(config)
434
+ self.encoder = MultiLayerAttentionMaskBertEncoder(config)
435
+ self.init_weights()
436
+
437
+ def forward(
438
+ self,
439
+ input_ids=None,
440
+ input_video_embeds=None,
441
+ attention_mask=None,
442
+ token_type_ids=None,
443
+ position_ids=None,
444
+ head_mask=None,
445
+ inputs_embeds=None,
446
+ encoder_hidden_states=None,
447
+ encoder_attention_mask=None,
448
+ output_attentions=None,
449
+ output_hidden_states=None,
450
+ return_dict=None,
451
+ separate_forward_split=None,
452
+ ):
453
+ output_attentions = (
454
+ output_attentions
455
+ if output_attentions is not None
456
+ else self.config.output_attentions
457
+ )
458
+ output_hidden_states = (
459
+ output_hidden_states
460
+ if output_hidden_states is not None
461
+ else self.config.output_hidden_states
462
+ )
463
+ return_dict = (
464
+ return_dict if return_dict is not None
465
+ else self.config.use_return_dict
466
+ )
467
+
468
+ if input_ids is not None and inputs_embeds is not None:
469
+ raise ValueError(
470
+ "You cannot specify both input_ids "
471
+ "and inputs_embeds at the same time"
472
+ )
473
+ elif input_ids is not None:
474
+ if input_video_embeds is not None:
475
+ input_shape = (
476
+ input_ids.size(0),
477
+ input_ids.size(1) + input_video_embeds.size(1),
478
+ )
479
+ else:
480
+ input_shape = (
481
+ input_ids.size(0),
482
+ input_ids.size(1),
483
+ )
484
+ elif inputs_embeds is not None:
485
+ if input_video_embeds is not None:
486
+ input_shape = (
487
+ inputs_embeds.size(0),
488
+ inputs_embeds.size(1) + input_video_embeds.size(1),
489
+ )
490
+ else:
491
+ input_shape = (
492
+ inputs_embeds.size(0),
493
+ inputs_embeds.size(1),
494
+ )
495
+ else:
496
+ raise ValueError(
497
+ "You have to specify either input_ids or inputs_embeds")
498
+
499
+ device = input_ids.device if input_ids is not None \
500
+ else inputs_embeds.device
501
+
502
+ if attention_mask is None:
503
+ attention_mask = torch.ones(input_shape, device=device)
504
+ if token_type_ids is None:
505
+ token_type_ids = torch.zeros(
506
+ input_shape, dtype=torch.long, device=device)
507
+
508
+ # We can provide a self-attention mask of dimensions
509
+ # [batch_size, from_seq_length, to_seq_length]
510
+ # ourselves in which case
511
+ # we just need to make it broadcastable to all heads.
512
+ extended_attention_mask: torch.Tensor = \
513
+ self.get_extended_attention_mask(
514
+ attention_mask, input_shape, device)
515
+
516
+ # If a 2D or 3D attention mask is provided for the cross-attention
517
+ # we need to make broadcastable to
518
+ # [batch_size, num_heads, seq_length, seq_length]
519
+ if self.config.is_decoder and encoder_hidden_states is not None:
520
+ (
521
+ encoder_batch_size,
522
+ encoder_sequence_length,
523
+ _,
524
+ ) = encoder_hidden_states.size()
525
+ encoder_hidden_shape = (
526
+ encoder_batch_size, encoder_sequence_length)
527
+ if encoder_attention_mask is None:
528
+ encoder_attention_mask = torch.ones(
529
+ encoder_hidden_shape, device=device)
530
+ encoder_extended_attention_mask = self.invert_attention_mask(
531
+ encoder_attention_mask
532
+ )
533
+ else:
534
+ encoder_extended_attention_mask = None
535
+
536
+ # Prepare head mask if needed
537
+ # 1.0 in head_mask indicate we keep the head
538
+ # attention_probs has shape bsz x n_heads x N x N
539
+ # input head_mask has shape [num_heads] or
540
+ # [num_hidden_layers x num_heads]
541
+ # and head_mask is converted to shape
542
+ # [num_hidden_layers x batch x num_heads x seq_length x seq_length]
543
+
544
+ head_mask = self.get_head_mask(
545
+ head_mask, self.config.num_hidden_layers)
546
+
547
+ embedding_output = self.embeddings(
548
+ input_ids,
549
+ input_video_embeds,
550
+ position_ids=position_ids,
551
+ token_type_ids=token_type_ids,
552
+ inputs_embeds=inputs_embeds,
553
+ )
554
+
555
+ if separate_forward_split is not None:
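+ # run the encoder on the first `separate_forward_split` positions and on the
+ # remainder separately (block-diagonal attention between the two chunks),
+ # then concatenate the outputs along the sequence dimension.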
556
+ split_embedding_output = \
557
+ embedding_output[:, :separate_forward_split]
558
+ split_extended_attention_mask = extended_attention_mask[
559
+ :, :, :, :separate_forward_split, :separate_forward_split
560
+ ]
561
+ split_encoder_outputs = self.encoder(
562
+ split_embedding_output,
563
+ attention_mask=split_extended_attention_mask,
564
+ head_mask=head_mask,
565
+ encoder_hidden_states=encoder_hidden_states,
566
+ encoder_attention_mask=encoder_extended_attention_mask,
567
+ output_attentions=output_attentions,
568
+ output_hidden_states=output_hidden_states,
569
+ return_dict=return_dict,
570
+ )
571
+ assert (
572
+ len(split_encoder_outputs) <= 2
573
+ ), "we do not support merge on attention for now."
574
+ encoder_outputs = []
575
+ encoder_outputs.append([split_encoder_outputs[0]])
576
+ if len(split_encoder_outputs) == 2:
577
+ encoder_outputs.append([])
578
+ for _all_hidden_states in split_encoder_outputs[1]:
579
+ encoder_outputs[-1].append([_all_hidden_states])
580
+
581
+ split_embedding_output = \
582
+ embedding_output[:, separate_forward_split:]
583
+ split_extended_attention_mask = extended_attention_mask[
584
+ :, :, :, separate_forward_split:, separate_forward_split:
585
+ ]
586
+
587
+ split_encoder_outputs = self.encoder(
588
+ split_embedding_output,
589
+ attention_mask=split_extended_attention_mask,
590
+ head_mask=head_mask,
591
+ encoder_hidden_states=encoder_hidden_states,
592
+ encoder_attention_mask=encoder_extended_attention_mask,
593
+ output_attentions=output_attentions,
594
+ output_hidden_states=output_hidden_states,
595
+ return_dict=return_dict,
596
+ )
597
+
598
+ assert (
599
+ len(split_encoder_outputs) <= 2
600
+ ), "we do not support merge on attention for now."
601
+ encoder_outputs[0].append(split_encoder_outputs[0])
602
+ encoder_outputs[0] = torch.cat(encoder_outputs[0], dim=1)
603
+ if len(split_encoder_outputs) == 2:
604
+ for layer_idx, _all_hidden_states in enumerate(
605
+ split_encoder_outputs[1]
606
+ ):
607
+ encoder_outputs[1][layer_idx].append(_all_hidden_states)
608
+ encoder_outputs[1][layer_idx] = torch.cat(
609
+ encoder_outputs[1][layer_idx], dim=1
610
+ )
611
+ encoder_outputs = tuple(encoder_outputs)
612
+ else:
613
+ encoder_outputs = self.encoder(
614
+ embedding_output,
615
+ attention_mask=extended_attention_mask,
616
+ head_mask=head_mask,
617
+ encoder_hidden_states=encoder_hidden_states,
618
+ encoder_attention_mask=encoder_extended_attention_mask,
619
+ output_attentions=output_attentions,
620
+ output_hidden_states=output_hidden_states,
621
+ return_dict=return_dict,
622
+ )
623
+
624
+ sequence_output = encoder_outputs[0]
625
+ pooled_output = (
626
+ self.pooler(sequence_output) if self.pooler is not None else None
627
+ )
628
+
629
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
630
+
631
+ def get_extended_attention_mask(self, attention_mask, input_shape, device):
632
+ """This is borrowed from `modeling_utils.py` with the support of
633
+ multi-layer attention masks.
634
+ The second dim is expected to be number of layers.
635
+ See `MMAttentionMaskProcessor`.
636
+ Makes broadcastable attention and causal masks so that future
637
+ and masked tokens are ignored.
638
+
639
+ Arguments:
640
+ attention_mask (:obj:`torch.Tensor`):
641
+ Mask with ones indicating tokens to attend to,
642
+ zeros for tokens to ignore.
643
+ input_shape (:obj:`Tuple[int]`):
644
+ The shape of the input to the model.
645
+ device: (:obj:`torch.device`):
646
+ The device of the input to the model.
647
+
648
+ Returns:
649
+ :obj:`torch.Tensor` The extended attention mask, \
650
+ with the same dtype as :obj:`attention_mask.dtype`.
651
+ """
652
+ # We can provide a self-attention mask of dimensions
653
+ # [batch_size, from_seq_length, to_seq_length]
654
+ # ourselves in which case we just need to make it broadcastable
655
+ # to all heads.
656
+ if attention_mask.dim() == 4:
657
+ extended_attention_mask = attention_mask[:, :, None, :, :]
658
+ extended_attention_mask = extended_attention_mask.to(
659
+ dtype=self.dtype
660
+ ) # fp16 compatibility
661
+ extended_attention_mask = (1.0 - extended_attention_mask) \
662
+ * -10000.0
663
+ return extended_attention_mask
664
+ else:
665
+ return super().get_extended_attention_mask(
666
+ attention_mask, input_shape, device
667
+ )
668
+
669
+
670
+ class MultiLayerAttentionMaskBertEncoder(BertEncoder):
671
+ """extend BertEncoder with the capability of
672
+ multiple layers of attention mask."""
673
+
674
+ def forward(
675
+ self,
676
+ hidden_states,
677
+ attention_mask=None,
678
+ head_mask=None,
679
+ encoder_hidden_states=None,
680
+ encoder_attention_mask=None,
681
+ output_attentions=False,
682
+ output_hidden_states=False,
683
+ return_dict=False,
684
+ ):
685
+ all_hidden_states = () if output_hidden_states else None
686
+ all_attentions = () if output_attentions else None
687
+ for i, layer_module in enumerate(self.layer):
688
+ if output_hidden_states:
689
+ all_hidden_states = all_hidden_states + (hidden_states,)
690
+ layer_head_mask = head_mask[i] if head_mask is not None else None
691
+
692
+ layer_attention_mask = (
693
+ attention_mask[:, i, :, :, :]
694
+ if attention_mask.dim() == 5
695
+ else attention_mask
696
+ )
697
+
698
+ if getattr(self.config, "gradient_checkpointing", False):
699
+
700
+ def create_custom_forward(module):
701
+ def custom_forward(*inputs):
702
+ return module(*inputs, output_attentions)
703
+
704
+ return custom_forward
705
+
706
+ layer_outputs = torch.utils.checkpoint.checkpoint(
707
+ create_custom_forward(layer_module),
708
+ hidden_states,
709
+ layer_attention_mask,
710
+ layer_head_mask,
711
+ encoder_hidden_states,
712
+ encoder_attention_mask,
713
+ )
714
+ else:
715
+ layer_outputs = layer_module(
716
+ hidden_states,
717
+ layer_attention_mask,
718
+ layer_head_mask,
719
+ encoder_hidden_states,
720
+ encoder_attention_mask,
721
+ output_attentions,
722
+ )
723
+ hidden_states = layer_outputs[0]
724
+ if output_attentions:
725
+ all_attentions = all_attentions + (layer_outputs[1],)
726
+
727
+ if output_hidden_states:
728
+ all_hidden_states = all_hidden_states + (hidden_states,)
729
+
730
+ return tuple(
731
+ v
732
+ for v in [hidden_states, all_hidden_states, all_attentions]
733
+ if v is not None
734
+ )
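As a shape reference for the multi-layer mask path above, a small sketch (tensor names and sizes are illustrative only, not part of the module):

    import torch
    batch, num_layers, seq_len = 2, 12, 16
    # a 4-D multi-layer mask [batch, num_layers, from_seq, to_seq] is extended to
    # [batch, num_layers, 1, from_seq, to_seq] by MMBertModel.get_extended_attention_mask
    extended_mask = torch.zeros(batch, num_layers, 1, seq_len, seq_len)
    # MultiLayerAttentionMaskBertEncoder then slices one mask per layer:
    layer_mask = extended_mask[:, 0, :, :, :]
    assert layer_mask.shape == (batch, 1, seq_len, seq_len)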
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/modules/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ from .mm import *
6
+
7
+ try:
8
+ from .expmm import *
9
+ except ImportError:
10
+ pass
fairseq-a54021305d6b3c4c5959ac9395135f63202db8f1/examples/MMPT/mmpt/modules/mm.py ADDED
@@ -0,0 +1,145 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Copyright (c) Facebook, Inc. All Rights Reserved
17
+
18
+
19
+ import torch
20
+
21
+ from torch import nn
22
+
23
+ try:
24
+ from transformers.modeling_bert import (
25
+ BertEmbeddings,
26
+ ACT2FN,
27
+ )
28
+ except ImportError:
29
+ pass
30
+
31
+
32
+ class VideoTokenMLP(nn.Module):
33
+ def __init__(self, config):
34
+ super().__init__()
35
+ input_dim = config.input_dim if hasattr(config, "input_dim") else 512
36
+ self.linear1 = nn.Linear(input_dim, config.hidden_size)
37
+ self.LayerNorm = nn.LayerNorm(config.hidden_size)
38
+ self.activation = ACT2FN[config.hidden_act]
39
+ self.linear2 = nn.Linear(config.hidden_size, config.hidden_size)
40
+
41
+ def forward(self, hidden_states):
42
+ hidden_states = self.linear1(hidden_states)
43
+ hidden_states = self.activation(hidden_states)
44
+ hidden_states = self.LayerNorm(hidden_states)
45
+ hidden_states = self.linear2(hidden_states)
46
+ return hidden_states
47
+
48
+
49
+ class MMBertEmbeddings(BertEmbeddings):
50
+ def __init__(self, config):
51
+ super().__init__(config)
52
+ self.max_video_len = config.max_video_len
53
+ if hasattr(config, "use_seg_emb") and config.use_seg_emb:
54
+ """the original VLM paper uses seg_embeddings for temporal space.
55
+ although not used it changed the randomness of initialization.
56
+ we keep it for reproducibility.
57
+ """
58
+ self.seg_embeddings = nn.Embedding(256, config.hidden_size)
59
+
60
+ def forward(
61
+ self,
62
+ input_ids,
63
+ input_video_embeds,
64
+ token_type_ids=None,
65
+ position_ids=None,
66
+ inputs_embeds=None,
67
+ ):
68
+ input_tensor = input_ids if input_ids is not None else inputs_embeds
69
+ if input_video_embeds is not None:
70
+ input_shape = (
71
+ input_tensor.size(0),
72
+ input_tensor.size(1) + input_video_embeds.size(1),
73
+ )
74
+ else:
75
+ input_shape = (input_tensor.size(0), input_tensor.size(1))
76
+
77
+ if position_ids is None:
78
+ """
79
+ Automatically skip position embeddings for the text-only case.
80
+ Use cases:
81
+ (1) action localization and segmentation:
82
+ a len-1 dummy video token is fed in, so the text part must
83
+ skip input_video_embeds.size(1) positions to get the right
84
+ position_ids for the video [SEP] and the remaining text tokens.
85
+ (2) MMFusionShare with two forward passes:
86
+ in `forward_text`, input_video_embeds is None and the
87
+ video [SEP] position must be skipped.
88
+
89
+ # video_len + 1: [CLS] + video_embed
90
+ # self.max_video_len + 1: [SEP] for video.
91
+ # self.max_video_len + 2: [SEP] for video.
92
+ # self.max_video_len + input_ids.size(1): rest for text.
93
+ """
94
+ if input_video_embeds is not None:
95
+ video_len = input_video_embeds.size(1)
96
+ starting_offset = self.max_video_len + 1 # video [SEP]
97
+ ending_offset = self.max_video_len + input_ids.size(1)
98
+ else:
99
+ video_len = 0
100
+ starting_offset = self.max_video_len + 2 # first text token.
101
+ ending_offset = self.max_video_len + input_ids.size(1) + 1
102
+ position_ids = torch.cat([
103
+ self.position_ids[:, :video_len + 1],
104
+ self.position_ids[:, starting_offset:ending_offset]
105
+ ], dim=1)
106
+
107
+ if token_type_ids is None:
108
+ token_type_ids = torch.zeros(
109
+ input_shape, dtype=torch.long, device=self.position_ids.device
110
+ )
111
+
112
+ """
113
+ The format of input_ids is: [CLS] [SEP] caption [SEP] padding.
114
+ The goal is to build: [CLS] video tokens [SEP] caption [SEP].
115
+ """
116
+ if inputs_embeds is None:
117
+ inputs_embeds = self.word_embeddings(input_ids)
118
+ if input_video_embeds is not None:
119
+ inputs_mm_embeds = torch.cat([
120
+ inputs_embeds[:, :1], input_video_embeds, inputs_embeds[:, 1:]
121
+ ], dim=1)
122
+ else:
123
+ # text only for `MMFusionShare`.
124
+ inputs_mm_embeds = inputs_embeds
125
+
126
+ position_embeddings = self.position_embeddings(position_ids)
127
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
128
+ embeddings = inputs_mm_embeds + position_embeddings
129
+ embeddings += token_type_embeddings
130
+
131
+ embeddings = self.LayerNorm(embeddings)
132
+ embeddings = self.dropout(embeddings)
133
+ return embeddings
134
+
135
+
136
+ class AlignHead(nn.Module):
137
+ """this will load pre-trained weights for NSP, which is desirable."""
138
+
139
+ def __init__(self, config):
140
+ super().__init__()
141
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
142
+
143
+ def forward(self, dropout_pooled_output):
144
+ logits = self.seq_relationship(dropout_pooled_output)
145
+ return logits
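As a closing illustration of how MMBertEmbeddings splices position ids for a video + text sequence, a minimal sketch (all sizes are illustrative; `position_ids` stands in for the module's buffer):

    import torch
    max_video_len, video_len, n_text = 32, 4, 8    # n_text plays the role of input_ids.size(1)
    position_ids = torch.arange(512).unsqueeze(0)  # stand-in for self.position_ids
    spliced = torch.cat([
        position_ids[:, :video_len + 1],                            # [CLS] + video frames
        position_ids[:, max_video_len + 1:max_video_len + n_text],  # video [SEP] + caption tokens
    ], dim=1)
    # length matches [CLS] + video + (input_ids without its [CLS])
    assert spliced.size(1) == video_len + n_text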