Kurokabe commited on
Commit
3be620b
1 Parent(s): 2a06e99

Upload 84 files

Browse files

Add application files

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +4 -0
  2. .gitattributes +1 -0
  3. .gitignore +208 -0
  4. Dockerfile +21 -0
  5. configs/colab.yaml +50 -0
  6. configs/kny_image.yaml +47 -0
  7. configs/kny_image_full_style.yaml +47 -0
  8. configs/kny_image_full_vgg19.yaml +47 -0
  9. configs/kny_transformer_light.yaml +60 -0
  10. configs/kny_video_gpt2_large.yaml +50 -0
  11. configs/kny_video_gpt2_large_gradio.yaml +50 -0
  12. configs/kny_video_gpt2_medium.yaml +50 -0
  13. configs/kny_video_gpt2_xl.yaml +50 -0
  14. ganime/__main__.py +4 -0
  15. ganime/app.py +212 -0
  16. ganime/configs/__init__.py +0 -0
  17. ganime/configs/model_configs.py +70 -0
  18. ganime/data/__init__.py +0 -0
  19. ganime/data/base.py +282 -0
  20. ganime/data/experimental.py +222 -0
  21. ganime/data/kny.py +19 -0
  22. ganime/data/mnist.py +103 -0
  23. ganime/metrics/image.py +70 -0
  24. ganime/metrics/video.py +98 -0
  25. ganime/model/__init__.py +0 -0
  26. ganime/model/base.py +45 -0
  27. ganime/model/moving_vae.py +126 -0
  28. ganime/model/p2p/__init__.py +0 -0
  29. ganime/model/p2p/p2p.py +543 -0
  30. ganime/model/p2p/p2p_test.py +713 -0
  31. ganime/model/p2p/p2p_v2.py +498 -0
  32. ganime/model/p2p/p2p_v3.py +237 -0
  33. ganime/model/vae/vae.py +98 -0
  34. ganime/model/vq_vae/vq_vae.py +143 -0
  35. ganime/model/vqgan/__init__.py +0 -0
  36. ganime/model/vqgan/discriminator/__init__.py +0 -0
  37. ganime/model/vqgan/discriminator/model.py +64 -0
  38. ganime/model/vqgan/losses/__init__.py +0 -0
  39. ganime/model/vqgan/losses/lpips.py +134 -0
  40. ganime/model/vqgan/losses/vqperceptual.py +47 -0
  41. ganime/model/vqgan/vqgan.py +722 -0
  42. ganime/model/vqgan_clean/__init__.py +0 -0
  43. ganime/model/vqgan_clean/diffusion/__init__.py +0 -0
  44. ganime/model/vqgan_clean/diffusion/decoder.py +115 -0
  45. ganime/model/vqgan_clean/diffusion/encoder.py +125 -0
  46. ganime/model/vqgan_clean/diffusion/layers.py +179 -0
  47. ganime/model/vqgan_clean/discriminator/__init__.py +0 -0
  48. ganime/model/vqgan_clean/discriminator/model.py +88 -0
  49. ganime/model/vqgan_clean/discriminator/model_bkp.py +76 -0
  50. ganime/model/vqgan_clean/experimental/gpt2_embedding.py +1127 -0
.dockerignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .git
2
+ data
3
+ checkpoints
4
+ logs
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ models/vgg19/imagenet-vgg-verydeep-19.mat filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,venv
3
+ # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,python,venv
4
+
5
+ ### Python ###
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py,cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+ cover/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ .pybuilder/
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ #Pipfile.lock
101
+
102
+ # poetry
103
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
104
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
105
+ # commonly ignored for libraries.
106
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
107
+ #poetry.lock
108
+
109
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
110
+ __pypackages__/
111
+
112
+ # Celery stuff
113
+ celerybeat-schedule
114
+ celerybeat.pid
115
+
116
+ # SageMath parsed files
117
+ *.sage.py
118
+
119
+ # Environments
120
+ .env
121
+ .venv
122
+ env/
123
+ venv/
124
+ ENV/
125
+ env.bak/
126
+ venv.bak/
127
+
128
+ # Spyder project settings
129
+ .spyderproject
130
+ .spyproject
131
+
132
+ # Rope project settings
133
+ .ropeproject
134
+
135
+ # mkdocs documentation
136
+ /site
137
+
138
+ # mypy
139
+ .mypy_cache/
140
+ .dmypy.json
141
+ dmypy.json
142
+
143
+ # Pyre type checker
144
+ .pyre/
145
+
146
+ # pytype static type analyzer
147
+ .pytype/
148
+
149
+ # Cython debug symbols
150
+ cython_debug/
151
+
152
+ # PyCharm
153
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
154
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
155
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
156
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
157
+ #.idea/
158
+
159
+ ### venv ###
160
+ # Virtualenv
161
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
162
+ [Bb]in
163
+ [Ii]nclude
164
+ [Ll]ib
165
+ [Ll]ib64
166
+ [Ll]ocal
167
+ #[Ss]cripts
168
+ pyvenv.cfg
169
+ pip-selfcheck.json
170
+
171
+ ### VisualStudioCode ###
172
+ .vscode/*
173
+ # !.vscode/settings.json
174
+ # !.vscode/tasks.json
175
+ # !.vscode/launch.json
176
+ # !.vscode/extensions.json
177
+ # !.vscode/*.code-snippets
178
+
179
+ # Local History for Visual Studio Code
180
+ .history/
181
+
182
+ # Built Visual Studio Code Extensions
183
+ *.vsix
184
+
185
+ ### VisualStudioCode Patch ###
186
+ # Ignore all local history of files
187
+ .history
188
+ .ionide
189
+
190
+ # Support for Project snippet scope
191
+
192
+ # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,venv
193
+
194
+ *.npy
195
+ checkpoints/*
196
+ ganime_results/*
197
+ data/*
198
+ *.avi
199
+ *.out
200
+ notebooks/model/p2p_v2/*
201
+ logs/*
202
+ interesting_logs/*
203
+ notebooks/model/vq-gan/train_output/*
204
+ notebooks/model/vq-gan/validation_output/*
205
+ notebooks/model/vq-gan/test_output/*
206
+ *.zip
207
+ flagged/*
208
+ notebooks/model/vq-gan/gpt_kny_light_large_256/*
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM tensorflow/tensorflow:2.7.0-gpu-jupyter
2
+ # Because of https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ and https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1112828208
3
+ RUN rm /etc/apt/sources.list.d/cuda.list
4
+ RUN rm /etc/apt/sources.list.d/nvidia-ml.list
5
+ RUN apt-key del 7fa2af80
6
+ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
7
+ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub
8
+
9
+ # Update and install ffmpeg
10
+ RUN apt-get -y update
11
+ RUN apt-get -y upgrade
12
+ RUN apt-get install -y ffmpeg
13
+
14
+ # Setup environment
15
+ WORKDIR /GANime
16
+ ENV PROJECT_DIR=/GANime
17
+ COPY requirements.txt /GANime/requirements.txt
18
+ RUN pip install -r requirements.txt
19
+ COPY . .
20
+ RUN pip install -e .
21
+ EXPOSE 8888
configs/colab.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ transformer_config:
3
+ #checkpoint_path: GANime/checkpoints/kny_video_full_gpt2_medium/checkpoint
4
+ remaining_frames_method: "own_embeddings"
5
+ transformer_type: "gpt2-medium"
6
+ first_stage_config:
7
+ checkpoint_path: GANime/checkpoints/kny_image_full_vgg19/checkpoint
8
+ vqvae_config:
9
+ beta: 0.25
10
+ num_embeddings: 50257
11
+ embedding_dim: 128
12
+ autoencoder_config:
13
+ z_channels: 512
14
+ channels: 32
15
+ channels_multiplier:
16
+ - 2
17
+ - 4
18
+ - 8
19
+ - 8
20
+ num_res_blocks: 1
21
+ attention_resolution:
22
+ - 16
23
+ resolution: 128
24
+ dropout: 0.0
25
+ discriminator_config:
26
+ num_layers: 3
27
+ filters: 64
28
+
29
+ loss_config:
30
+ discriminator:
31
+ loss: "hinge"
32
+ factor: 1.0
33
+ iter_start: 16200
34
+ weight: 0.3
35
+ vqvae:
36
+ codebook_weight: 1.0
37
+ perceptual_weight: 4.0
38
+ perceptual_loss: "vgg19"
39
+
40
+ train:
41
+ batch_size: 64
42
+ accumulation_size: 1
43
+ n_epochs: 2000
44
+ len_x_train: 8000
45
+ warmup_epoch_percentage: 0.15
46
+ lr_start: 1e-5
47
+ lr_max: 2.5e-4
48
+ perceptual_loss_weight: 1.0
49
+ n_frames_before: 1
50
+ stop_ground_truth_after_epoch: 50
configs/kny_image.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ checkpoint_path: ../../../checkpoints/kny_image_full_no_disc/checkpoint
3
+ vqvae_config:
4
+ beta: 0.25
5
+ num_embeddings: 50257
6
+ embedding_dim: 128
7
+ autoencoder_config:
8
+ z_channels: 512
9
+ channels: 32
10
+ channels_multiplier:
11
+ - 2
12
+ - 4
13
+ - 8
14
+ - 8
15
+ num_res_blocks: 1
16
+ attention_resolution:
17
+ - 16
18
+ resolution: 128
19
+ dropout: 0.0
20
+ discriminator_config:
21
+ num_layers: 3
22
+ filters: 64
23
+
24
+ loss_config:
25
+ discriminator:
26
+ loss: "hinge"
27
+ factor: 1.0
28
+ iter_start: 5000
29
+ weight: 0.8
30
+ vqvae:
31
+ codebook_weight: 1.0
32
+ perceptual_weight: 4.0
33
+ perceptual_loss: "vgg19" # "vgg16", "vgg19", "style"
34
+
35
+ trainer:
36
+ batch_size: 32
37
+ n_epochs: 10000
38
+ gen_lr: 3e-5
39
+ disc_lr: 3e-5
40
+ gen_beta_1: 0.5
41
+ gen_beta_2: 0.9
42
+ disc_beta_1: 0.5
43
+ disc_beta_2: 0.9
44
+ gen_clip_norm: 1.0
45
+ disc_clip_norm: 1.0
46
+
47
+
configs/kny_image_full_style.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ checkpoint_path: ../../../checkpoints/kny_image_full_style/checkpoint
3
+ vqvae_config:
4
+ beta: 0.25
5
+ num_embeddings: 50257
6
+ embedding_dim: 128
7
+ autoencoder_config:
8
+ z_channels: 512
9
+ channels: 32
10
+ channels_multiplier:
11
+ - 2
12
+ - 4
13
+ - 8
14
+ - 8
15
+ num_res_blocks: 1
16
+ attention_resolution:
17
+ - 16
18
+ resolution: 128
19
+ dropout: 0.0
20
+ discriminator_config:
21
+ num_layers: 3
22
+ filters: 64
23
+
24
+ loss_config:
25
+ discriminator:
26
+ loss: "hinge"
27
+ factor: 1.0
28
+ iter_start: 50000000
29
+ weight: 0.8
30
+ vqvae:
31
+ codebook_weight: 1.0
32
+ perceptual_weight: 4.0
33
+ perceptual_loss: "style" # "vgg16", "vgg19", "style"
34
+
35
+ trainer:
36
+ batch_size: 32
37
+ n_epochs: 10000
38
+ gen_lr: 8e-5
39
+ disc_lr: 8e-5
40
+ gen_beta_1: 0.5
41
+ gen_beta_2: 0.9
42
+ disc_beta_1: 0.5
43
+ disc_beta_2: 0.9
44
+ gen_clip_norm: 1.0
45
+ disc_clip_norm: 1.0
46
+
47
+
configs/kny_image_full_vgg19.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ checkpoint_path: ../../../checkpoints/kny_image_full_vgg19/checkpoint
3
+ vqvae_config:
4
+ beta: 0.25
5
+ num_embeddings: 50257
6
+ embedding_dim: 128
7
+ autoencoder_config:
8
+ z_channels: 512
9
+ channels: 32
10
+ channels_multiplier:
11
+ - 2
12
+ - 4
13
+ - 8
14
+ - 8
15
+ num_res_blocks: 1
16
+ attention_resolution:
17
+ - 16
18
+ resolution: 128
19
+ dropout: 0.0
20
+ discriminator_config:
21
+ num_layers: 3
22
+ filters: 64
23
+
24
+ loss_config:
25
+ discriminator:
26
+ loss: "hinge"
27
+ factor: 1.0
28
+ iter_start: 50000000
29
+ weight: 0.8
30
+ vqvae:
31
+ codebook_weight: 1.0
32
+ perceptual_weight: 4.0
33
+ perceptual_loss: "vgg19" # "vgg16", "vgg19", "style"
34
+
35
+ trainer:
36
+ batch_size: 64
37
+ n_epochs: 10000
38
+ gen_lr: 3e-5
39
+ disc_lr: 5e-5
40
+ gen_beta_1: 0.5
41
+ gen_beta_2: 0.9
42
+ disc_beta_1: 0.5
43
+ disc_beta_2: 0.9
44
+ gen_clip_norm: 1.0
45
+ disc_clip_norm: 1.0
46
+
47
+
configs/kny_transformer_light.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ transformer_config:
3
+ checkpoint_path: ../../../checkpoints/kny_video_light/checkpoint
4
+ # vocab_size: 50257
5
+ # n_positions: 1024
6
+ # n_embd: 1024 #1280 #768
7
+ # n_layer: 24 #36 #12
8
+ # n_head: 16 #20 #12
9
+ # resid_pdrop: 0.1
10
+ # embd_pdrop: 0.1
11
+ # attn_pdrop: 0.1
12
+ # remaining_frames_method: "concat"
13
+ # remaining_frames_method: "token_type_ids"
14
+ remaining_frames_method: "own_embeddings"
15
+ first_stage_config:
16
+ checkpoint_path: ../../../checkpoints/kny_image_light_discriminator/checkpoint
17
+ vqvae_config:
18
+ beta: 0.25
19
+ num_embeddings: 64
20
+ embedding_dim: 256
21
+ autoencoder_config:
22
+ z_channels: 128
23
+ channels: 64
24
+ channels_multiplier:
25
+ - 1
26
+ - 1
27
+ - 2
28
+ - 2
29
+ - 4
30
+ num_res_blocks: 1
31
+ attention_resolution:
32
+ - 16
33
+ resolution: 128
34
+ dropout: 0.0
35
+ discriminator_config:
36
+ num_layers: 3
37
+ filters: 64
38
+
39
+ loss_config:
40
+ discriminator:
41
+ loss: "hinge"
42
+ factor: 1.0
43
+ iter_start: 16200
44
+ weight: 0.3
45
+ vqvae:
46
+ codebook_weight: 1.0
47
+ perceptual_weight: 4.0
48
+ perceptual_loss: "style"
49
+
50
+ train:
51
+ batch_size: 8
52
+ accumulation_size: 8
53
+ n_epochs: 2000
54
+ len_x_train: 631
55
+ warmup_epoch_percentage: 0.15
56
+ lr_start: 1e-5
57
+ lr_max: 2.5e-4
58
+ perceptual_loss_weight: 1.0
59
+ n_frames_before: 5
60
+ stop_ground_truth_after_epoch: 100
configs/kny_video_gpt2_large.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ transformer_config:
3
+ checkpoint_path: ../../../checkpoints/kny_video_full_gpt2_large_final/checkpoint
4
+ remaining_frames_method: "own_embeddings"
5
+ transformer_type: "gpt2-large"
6
+ first_stage_config:
7
+ checkpoint_path: ../../../checkpoints/kny_image_full_vgg19/checkpoint
8
+ vqvae_config:
9
+ beta: 0.25
10
+ num_embeddings: 50257
11
+ embedding_dim: 128
12
+ autoencoder_config:
13
+ z_channels: 512
14
+ channels: 32
15
+ channels_multiplier:
16
+ - 2
17
+ - 4
18
+ - 8
19
+ - 8
20
+ num_res_blocks: 1
21
+ attention_resolution:
22
+ - 16
23
+ resolution: 128
24
+ dropout: 0.0
25
+ discriminator_config:
26
+ num_layers: 3
27
+ filters: 64
28
+
29
+ loss_config:
30
+ discriminator:
31
+ loss: "hinge"
32
+ factor: 1.0
33
+ iter_start: 16200
34
+ weight: 0.3
35
+ vqvae:
36
+ codebook_weight: 1.0
37
+ perceptual_weight: 4.0
38
+ perceptual_loss: "vgg19"
39
+
40
+ train:
41
+ batch_size: 64
42
+ accumulation_size: 1
43
+ n_epochs: 10000
44
+ len_x_train: 28213
45
+ warmup_epoch_percentage: 0.15
46
+ lr_start: 1e-5
47
+ lr_max: 2.5e-4
48
+ perceptual_loss_weight: 1.0
49
+ n_frames_before: 1
50
+ stop_ground_truth_after_epoch: 1000
configs/kny_video_gpt2_large_gradio.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ transformer_config:
3
+ checkpoint_path: ./checkpoints/kny_video_full_gpt2_large_final/checkpoint
4
+ remaining_frames_method: "own_embeddings"
5
+ transformer_type: "gpt2-large"
6
+ first_stage_config:
7
+ checkpoint_path: ./checkpoints/kny_image_full_vgg19/checkpoint
8
+ vqvae_config:
9
+ beta: 0.25
10
+ num_embeddings: 50257
11
+ embedding_dim: 128
12
+ autoencoder_config:
13
+ z_channels: 512
14
+ channels: 32
15
+ channels_multiplier:
16
+ - 2
17
+ - 4
18
+ - 8
19
+ - 8
20
+ num_res_blocks: 1
21
+ attention_resolution:
22
+ - 16
23
+ resolution: 128
24
+ dropout: 0.0
25
+ discriminator_config:
26
+ num_layers: 3
27
+ filters: 64
28
+
29
+ loss_config:
30
+ discriminator:
31
+ loss: "hinge"
32
+ factor: 1.0
33
+ iter_start: 16200
34
+ weight: 0.3
35
+ vqvae:
36
+ codebook_weight: 1.0
37
+ perceptual_weight: 4.0
38
+ perceptual_loss: "vgg19"
39
+
40
+ train:
41
+ batch_size: 64
42
+ accumulation_size: 1
43
+ n_epochs: 10000
44
+ len_x_train: 28213
45
+ warmup_epoch_percentage: 0.15
46
+ lr_start: 1e-5
47
+ lr_max: 2.5e-4
48
+ perceptual_loss_weight: 1.0
49
+ n_frames_before: 1
50
+ stop_ground_truth_after_epoch: 1000
configs/kny_video_gpt2_medium.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ transformer_config:
3
+ checkpoint_path: ./checkpoints/kny_video_full_gpt2_medium/checkpoint
4
+ remaining_frames_method: "own_embeddings"
5
+ transformer_type: "gpt2-medium"
6
+ first_stage_config:
7
+ checkpoint_path: ./checkpoints/kny_image_full_vgg19/checkpoint
8
+ vqvae_config:
9
+ beta: 0.25
10
+ num_embeddings: 50257
11
+ embedding_dim: 128
12
+ autoencoder_config:
13
+ z_channels: 512
14
+ channels: 32
15
+ channels_multiplier:
16
+ - 2
17
+ - 4
18
+ - 8
19
+ - 8
20
+ num_res_blocks: 1
21
+ attention_resolution:
22
+ - 16
23
+ resolution: 128
24
+ dropout: 0.0
25
+ discriminator_config:
26
+ num_layers: 3
27
+ filters: 64
28
+
29
+ loss_config:
30
+ discriminator:
31
+ loss: "hinge"
32
+ factor: 1.0
33
+ iter_start: 16200
34
+ weight: 0.3
35
+ vqvae:
36
+ codebook_weight: 1.0
37
+ perceptual_weight: 4.0
38
+ perceptual_loss: "vgg19"
39
+
40
+ train:
41
+ batch_size: 64
42
+ accumulation_size: 1
43
+ n_epochs: 500
44
+ len_x_train: 28213
45
+ warmup_epoch_percentage: 0.15
46
+ lr_start: 5e-6
47
+ lr_max: 1e-4
48
+ perceptual_loss_weight: 1.0
49
+ n_frames_before: 5
50
+ stop_ground_truth_after_epoch: 200
configs/kny_video_gpt2_xl.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ transformer_config:
3
+ # checkpoint_path: ../../../checkpoints/kny_video_full_gpt2_xl/checkpoint
4
+ remaining_frames_method: "own_embeddings"
5
+ transformer_type: "gpt2-xl"
6
+ first_stage_config:
7
+ checkpoint_path: ../../../checkpoints/kny_image_full_vgg19/checkpoint
8
+ vqvae_config:
9
+ beta: 0.25
10
+ num_embeddings: 50257
11
+ embedding_dim: 128
12
+ autoencoder_config:
13
+ z_channels: 512
14
+ channels: 32
15
+ channels_multiplier:
16
+ - 2
17
+ - 4
18
+ - 8
19
+ - 8
20
+ num_res_blocks: 1
21
+ attention_resolution:
22
+ - 16
23
+ resolution: 128
24
+ dropout: 0.0
25
+ discriminator_config:
26
+ num_layers: 3
27
+ filters: 64
28
+
29
+ loss_config:
30
+ discriminator:
31
+ loss: "hinge"
32
+ factor: 1.0
33
+ iter_start: 16200
34
+ weight: 0.3
35
+ vqvae:
36
+ codebook_weight: 1.0
37
+ perceptual_weight: 4.0
38
+ perceptual_loss: "vgg19"
39
+
40
+ train:
41
+ batch_size: 64
42
+ accumulation_size: 1
43
+ n_epochs: 500
44
+ len_x_train: 28213
45
+ warmup_epoch_percentage: 0.15
46
+ lr_start: 5e-6
47
+ lr_max: 1e-4
48
+ perceptual_loss_weight: 1.0
49
+ n_frames_before: 1
50
+ stop_ground_truth_after_epoch: 200
ganime/__main__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from ganime import app
2
+
3
+ if __name__ == "__main__":
4
+ app.run()
ganime/app.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import click
4
+ import omegaconf
5
+ import ray
6
+ from pyprojroot.pyprojroot import here
7
+ from ray import tune
8
+ from ray.train import Trainer
9
+ from ray.tune.schedulers import AsyncHyperBandScheduler
10
+ from ray.tune.suggest import ConcurrencyLimiter
11
+ from ray.tune.suggest.optuna import OptunaSearch
12
+
13
+ from ganime.trainer.ganime import TrainableGANime
14
+
15
+ import os
16
+
17
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
18
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1, 2, 3, 4, 5, 6"
19
+
20
+
21
+ def get_metric_direction(metric: str):
22
+ if "loss" in metric:
23
+ return "min"
24
+ else:
25
+ raise ValueError(f"Unknown metric: {metric}")
26
+
27
+
28
+ def trial_name_id(trial):
29
+ return f"{trial.trainable_name}"
30
+
31
+
32
+ def trial_dirname_creator(trial):
33
+ return f"{trial.trial_id}"
34
+
35
+
36
+ def get_search_space(model):
37
+ if model == "vqgan":
38
+ return {
39
+ # "beta": tune.uniform(0.1, 1.0),
40
+ "num_embeddings": tune.choice([64, 128, 256]),
41
+ "embedding_dim": tune.choice([128, 256, 512, 1024]),
42
+ "z_channels": tune.choice([64, 128, 256]),
43
+ "channels": tune.choice([64, 128, 256]),
44
+ "channels_multiplier": tune.choice(
45
+ [
46
+ [1, 2, 4],
47
+ [1, 1, 2, 2],
48
+ [1, 2, 2, 4],
49
+ [1, 1, 2, 2, 4],
50
+ ]
51
+ ),
52
+ "attention_resolution": tune.choice([[16], [32], [16, 32]]),
53
+ "batch_size": tune.choice([8, 16]),
54
+ "dropout": tune.choice([0.0, 0.1, 0.2]),
55
+ "weight": tune.quniform(0.1, 1.0, 0.1),
56
+ "codebook_weight": tune.quniform(0.2, 2.0, 0.2),
57
+ "perceptual_weight": tune.quniform(0.5, 5.0, 0.5),
58
+ "gen_lr": tune.qloguniform(1e-5, 1e-3, 1e-5),
59
+ "disc_lr": tune.qloguniform(1e-5, 1e-3, 1e-5),
60
+ "gen_beta_1": tune.quniform(0.5, 0.9, 0.1),
61
+ "gen_beta_2": tune.quniform(0.9, 0.999, 0.001),
62
+ "disc_beta_1": tune.quniform(0.5, 0.9, 0.1),
63
+ "disc_beta_2": tune.quniform(0.9, 0.999, 0.001),
64
+ "gen_clip_norm": tune.choice([1.0, None]),
65
+ "disc_clip_norm": tune.choice([1.0, None]),
66
+ }
67
+ elif model == "gpt":
68
+ return {
69
+ "remaining_frames_method": tune.choice(
70
+ ["concat", "token_type_ids", "own_embeddings"]
71
+ ),
72
+ # "batch_size": tune.choice([8, 16]),
73
+ "lr_max": tune.qloguniform(1e-5, 1e-3, 5e-5),
74
+ "lr_start": tune.sample_from(lambda spec: spec.config.lr_max / 10),
75
+ "perceptual_loss_weight": tune.quniform(0.0, 1.0, 0.1),
76
+ "n_frames_before": tune.randint(1, 10),
77
+ }
78
+
79
+
80
+ def tune_ganime(
81
+ experiment_name: str,
82
+ dataset_name: str,
83
+ config_file: str,
84
+ model: str,
85
+ metric: str,
86
+ epochs: int,
87
+ num_samples: int,
88
+ num_cpus: int,
89
+ num_gpus: int,
90
+ max_concurrent_trials: int,
91
+ ):
92
+
93
+ dataset_path = here("data")
94
+ analysis = tune.run(
95
+ TrainableGANime,
96
+ name=experiment_name,
97
+ search_alg=ConcurrencyLimiter(
98
+ OptunaSearch(), max_concurrent=max_concurrent_trials
99
+ ),
100
+ scheduler=AsyncHyperBandScheduler(max_t=epochs, grace_period=5),
101
+ metric=metric,
102
+ mode=get_metric_direction(metric),
103
+ num_samples=num_samples,
104
+ stop={"training_iteration": epochs},
105
+ local_dir="./ganime_results",
106
+ config={
107
+ "dataset_name": dataset_name,
108
+ "dataset_path": dataset_path,
109
+ "model": model,
110
+ "config_file": config_file,
111
+ "hyperparameters": get_search_space(model),
112
+ },
113
+ resources_per_trial={
114
+ "cpu": num_cpus // max_concurrent_trials,
115
+ "gpu": num_gpus / max_concurrent_trials,
116
+ },
117
+ trial_name_creator=trial_name_id,
118
+ trial_dirname_creator=trial_dirname_creator,
119
+ )
120
+ best_loss = analysis.get_best_config(metric="total_loss", mode="min")
121
+ # best_accuracy = analysis.get_best_config(metric="accuracy", mode="max")
122
+ print(f"Best loss config: {best_loss}")
123
+ # print(f"Best accuracy config: {best_accuracy}")
124
+ return analysis
125
+
126
+
127
+ @click.command()
128
+ @click.option(
129
+ "--dataset",
130
+ type=click.Choice(
131
+ ["moving_mnist_images", "kny_images", "kny_images_light"], case_sensitive=False
132
+ ),
133
+ default="kny_images_light",
134
+ help="Dataset to use",
135
+ )
136
+ @click.option(
137
+ "--model",
138
+ type=click.Choice(["vqgan", "gpt"], case_sensitive=False),
139
+ default="vqgan",
140
+ help="Model to use",
141
+ )
142
+ @click.option(
143
+ "--epochs",
144
+ default=500,
145
+ help="Number of epochs to run",
146
+ )
147
+ @click.option(
148
+ "--num_samples",
149
+ default=100,
150
+ help="Total number of trials to run",
151
+ )
152
+ @click.option(
153
+ "--num_cpus",
154
+ default=64,
155
+ help="Number of cpus to use",
156
+ )
157
+ @click.option(
158
+ "--num_gpus",
159
+ default=6,
160
+ help="Number of gpus to use",
161
+ )
162
+ @click.option(
163
+ "--max_concurrent_trials",
164
+ default=6,
165
+ help="Maximum number of concurrent trials",
166
+ )
167
+ @click.option(
168
+ "--metric",
169
+ type=click.Choice(
170
+ ["total_loss", "reconstruction_loss", "vq_loss", "disc_loss"],
171
+ case_sensitive=False,
172
+ ),
173
+ default="total_loss",
174
+ help="The metric used to select the best trial",
175
+ )
176
+ @click.option(
177
+ "--experiment_name",
178
+ default="kny_images_light_v2",
179
+ help="The name of the experiment for logging in Tensorboard",
180
+ )
181
+ @click.option(
182
+ "--config_file",
183
+ default="kny_image.yaml",
184
+ help="The name of the config file located inside ./config",
185
+ )
186
+ def run(
187
+ experiment_name: str,
188
+ config_file: str,
189
+ dataset: str,
190
+ model: str,
191
+ epochs: int,
192
+ num_samples: int,
193
+ num_cpus: int,
194
+ num_gpus: int,
195
+ max_concurrent_trials: int,
196
+ metric: str,
197
+ ):
198
+ config_file = here(os.path.join("configs", config_file))
199
+
200
+ ray.init(num_cpus=num_cpus, num_gpus=num_gpus)
201
+ tune_ganime(
202
+ experiment_name=experiment_name,
203
+ dataset_name=dataset,
204
+ config_file=config_file,
205
+ model=model,
206
+ epochs=epochs,
207
+ num_samples=num_samples,
208
+ num_cpus=num_cpus,
209
+ num_gpus=num_gpus,
210
+ max_concurrent_trials=max_concurrent_trials,
211
+ metric=metric,
212
+ )
ganime/configs/__init__.py ADDED
File without changes
ganime/configs/model_configs.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List
3
+ try:
4
+ from typing import Literal
5
+ except ImportError:
6
+ from typing_extensions import Literal
7
+
8
+
9
+ @dataclass
10
+ class GPTConfig:
11
+ n_layer: int
12
+ n_head: int
13
+ n_embedding: int
14
+ vocab_size: int
15
+ block_size: int
16
+ embedding_percentage_drop: float
17
+ attention_percentage_drop: float
18
+
19
+
20
+ @dataclass
21
+ class VQVAEConfig:
22
+ beta: float
23
+ num_embeddings: int
24
+ embedding_dim: int
25
+
26
+
27
+ @dataclass
28
+ class AutoencoderConfig:
29
+ z_channels: int
30
+ channels: int
31
+ channels_multiplier: List[int]
32
+ num_res_blocks: int
33
+ attention_resolution: List[int]
34
+ resolution: int
35
+ dropout: float
36
+
37
+
38
+ @dataclass
39
+ class DiscriminatorConfig:
40
+ num_layers: int
41
+ filters: int
42
+
43
+
44
+ @dataclass
45
+ class DiscriminatorLossConfig:
46
+ loss: Literal["hinge, vanilla"]
47
+ factor: float
48
+ iter_start: int
49
+ weight: float
50
+
51
+
52
+ @dataclass
53
+ class VQVAELossConfig:
54
+ codebook_weight: float
55
+ perceptual_weight: float
56
+
57
+
58
+ @dataclass
59
+ class LossConfig:
60
+ discriminator: DiscriminatorLossConfig
61
+ vqvae: VQVAELossConfig
62
+ perceptual_loss: str
63
+
64
+
65
+ @dataclass
66
+ class ModelConfig:
67
+ vqvae_config: VQVAEConfig
68
+ autoencoder_config: AutoencoderConfig
69
+ discriminator_config: DiscriminatorConfig
70
+ loss_config: LossConfig
ganime/data/__init__.py ADDED
File without changes
ganime/data/base.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import os
5
+ from tensorflow.keras.utils import Sequence
6
+ from abc import ABC, abstractmethod
7
+ from typing import Literal
8
+ import math
9
+ from ganime.data.experimental import ImageDataset
10
+
11
+
12
+ # class SequenceDataset(Sequence):
13
+ # def __init__(
14
+ # self,
15
+ # dataset_path: str,
16
+ # batch_size: int,
17
+ # split: Literal["train", "validation", "test"] = "train",
18
+ # ):
19
+ # self.batch_size = batch_size
20
+ # self.split = split
21
+ # self.data = self.load_data(dataset_path, split)
22
+ # self.data = self.preprocess_data(self.data)
23
+
24
+ # self.indices = np.arange(self.data.shape[0])
25
+ # self.on_epoch_end()
26
+
27
+ # @abstractmethod
28
+ # def load_data(self, dataset_path: str, split: str) -> np.ndarray:
29
+ # pass
30
+
31
+ # def preprocess_data(self, data: np.ndarray) -> np.ndarray:
32
+ # return data
33
+
34
+ # def __len__(self):
35
+ # return math.ceil(len(self.data) / self.batch_size)
36
+
37
+ # def __getitem__(self, idx):
38
+ # inds = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
39
+ # batch_x = self.data[inds]
40
+ # batch_y = batch_x
41
+
42
+ # return batch_x, batch_y
43
+
44
+ # def get_fixed_batch(self, idx):
45
+ # self.fixed_indices = (
46
+ # self.fixed_indices
47
+ # if hasattr(self, "fixed_indices")
48
+ # else self.indices[
49
+ # idx * self.batch_size : (idx + 1) * self.batch_size
50
+ # ].copy()
51
+ # )
52
+ # batch_x = self.data[self.fixed_indices]
53
+ # batch_y = batch_x
54
+
55
+ # return batch_x, batch_y
56
+
57
+ # def on_epoch_end(self):
58
+ # np.random.shuffle(self.indices)
59
+
60
+
61
+ # def load_kny_images(
62
+ # dataset_path: str, batch_size: int
63
+ # ) -> Tuple[tf.data.Dataset, tf.data.Dataset, tuple]:
64
+ # import skvideo.io
65
+
66
+ # if os.path.exists(os.path.join(dataset_path, "kny", "kny_images.npy")):
67
+ # data = np.load(os.path.join(dataset_path, "kny", "kny_images.npy"))
68
+ # else:
69
+ # data = skvideo.io.vread(os.path.join(dataset_path, "kny", "01.mp4"))
70
+ # np.random.shuffle(data)
71
+
72
+ # def _preprocess(sample):
73
+ # image = tf.cast(sample, tf.float32) / 255.0 # Scale to unit interval.
74
+ # # video = video < tf.random.uniform(tf.shape(video)) # Randomly binarize.
75
+ # image = tf.image.resize(image, [64, 64])
76
+
77
+ # return image, image
78
+
79
+ # train_dataset = (
80
+ # tf.data.Dataset.from_tensor_slices(data[:5000])
81
+ # .map(_preprocess)
82
+ # .batch(batch_size)
83
+ # .prefetch(tf.data.AUTOTUNE)
84
+ # .shuffle(int(10e3))
85
+ # )
86
+ # test_dataset = (
87
+ # tf.data.Dataset.from_tensor_slices(data[5000:6000])
88
+ # .map(_preprocess)
89
+ # .batch(batch_size)
90
+ # .prefetch(tf.data.AUTOTUNE)
91
+ # .shuffle(int(10e3))
92
+ # )
93
+
94
+ # return train_dataset, test_dataset, data.shape[1:]
95
+
96
+
97
+ # def load_moving_mnist_vae(
98
+ # dataset_path: str, batch_size: int
99
+ # ) -> Tuple[tf.data.Dataset, tf.data.Dataset, tuple]:
100
+ # data = np.load(os.path.join(dataset_path, "moving_mnist", "mnist_test_seq.npy"))
101
+ # data.shape
102
+
103
+ # # We can see that data is of shape (window, n_samples, width, height)
104
+ # # But we want for keras something of shape (n_samples, window, width, height)
105
+ # data = np.moveaxis(data, 0, 1)
106
+ # # Also expand dimensions to have channels at the end (n_samples, window, width, height, channels)
107
+ # data = np.expand_dims(data, axis=-1)
108
+
109
+ # def _preprocess(sample):
110
+ # video = tf.cast(sample, tf.float32) / 255.0 # Scale to unit interval.
111
+ # # video = video < tf.random.uniform(tf.shape(video)) # Randomly binarize.
112
+ # return video, video
113
+
114
+ # train_dataset = (
115
+ # tf.data.Dataset.from_tensor_slices(data[:9000])
116
+ # .map(_preprocess)
117
+ # .batch(batch_size)
118
+ # .prefetch(tf.data.AUTOTUNE)
119
+ # .shuffle(int(10e3))
120
+ # )
121
+ # test_dataset = (
122
+ # tf.data.Dataset.from_tensor_slices(data[9000:])
123
+ # .map(_preprocess)
124
+ # .batch(batch_size)
125
+ # .prefetch(tf.data.AUTOTUNE)
126
+ # .shuffle(int(10e3))
127
+ # )
128
+
129
+ # return train_dataset, test_dataset, data.shape[1:]
130
+
131
+
132
+ # def load_moving_mnist(
133
+ # dataset_path: str, batch_size: int
134
+ # ) -> Tuple[tf.data.Dataset, tf.data.Dataset, tuple]:
135
+ # data = np.load(os.path.join(dataset_path, "moving_mnist", "mnist_test_seq.npy"))
136
+ # data.shape
137
+
138
+ # # We can see that data is of shape (window, n_samples, width, height)
139
+ # # But we want for keras something of shape (n_samples, window, width, height)
140
+ # data = np.moveaxis(data, 0, 1)
141
+ # # Also expand dimensions to have channels at the end (n_samples, window, width, height, channels)
142
+ # data = np.expand_dims(data, axis=-1)
143
+
144
+ # def _preprocess(sample):
145
+ # video = tf.cast(sample, tf.float32) / 255.0 # Scale to unit interval.
146
+ # # video = video < tf.random.uniform(tf.shape(video)) # Randomly binarize.
147
+ # first_frame = video[0:1, ...]
148
+ # last_frame = video[-1:, ...]
149
+ # first_last = tf.concat([first_frame, last_frame], axis=0)
150
+
151
+ # return first_last, video
152
+
153
+ # train_dataset = (
154
+ # tf.data.Dataset.from_tensor_slices(data[:9000])
155
+ # .map(_preprocess)
156
+ # .batch(batch_size)
157
+ # .prefetch(tf.data.AUTOTUNE)
158
+ # .shuffle(int(10e3))
159
+ # )
160
+ # test_dataset = (
161
+ # tf.data.Dataset.from_tensor_slices(data[9000:])
162
+ # .map(_preprocess)
163
+ # .batch(batch_size)
164
+ # .prefetch(tf.data.AUTOTUNE)
165
+ # .shuffle(int(10e3))
166
+ # )
167
+
168
+ # return train_dataset, test_dataset, data.shape[1:]
169
+
170
+
171
+ # def load_mnist(
172
+ # dataset_path: str, batch_size: int
173
+ # ) -> Tuple[tf.data.Dataset, tf.data.Dataset, tuple]:
174
+ # data = np.load(os.path.join(dataset_path, "moving_mnist", "mnist_test_seq.npy"))
175
+ # data.shape
176
+
177
+ # # We can see that data is of shape (window, n_samples, width, height)
178
+ # # But we want for keras something of shape (n_samples, window, width, height)
179
+ # data = np.moveaxis(data, 0, 1)
180
+ # # Also expand dimensions to have channels at the end (n_samples, window, width, height, channels)
181
+ # data = np.expand_dims(data, axis=-1)
182
+
183
+ # def _preprocess(sample):
184
+ # video = tf.cast(sample, tf.float32) / 255.0 # Scale to unit interval.
185
+ # # video = video < tf.random.uniform(tf.shape(video)) # Randomly binarize.
186
+ # first_frame = video[0, ...]
187
+
188
+ # first_frame = tf.image.grayscale_to_rgb(first_frame)
189
+
190
+ # return first_frame, first_frame
191
+
192
+ # train_dataset = (
193
+ # tf.data.Dataset.from_tensor_slices(data[:9000])
194
+ # .map(_preprocess)
195
+ # .batch(batch_size)
196
+ # .prefetch(tf.data.AUTOTUNE)
197
+ # .shuffle(int(10e3))
198
+ # )
199
+ # test_dataset = (
200
+ # tf.data.Dataset.from_tensor_slices(data[9000:])
201
+ # .map(_preprocess)
202
+ # .batch(batch_size)
203
+ # .prefetch(tf.data.AUTOTUNE)
204
+ # .shuffle(int(10e3))
205
+ # )
206
+
207
+ # return train_dataset, test_dataset, data.shape[1:]
208
+ def preprocess_image(element):
209
+ element = tf.reshape(element, (tf.shape(element)[0], tf.shape(element)[1], 3))
210
+ element = tf.cast(element, tf.float32) / 255.0
211
+ return element, element
212
+
213
+
214
+ def load_kny_images_light(dataset_path, batch_size):
215
+ dataset_length = 34045
216
+ path = os.path.join(dataset_path, "kny", "images_tfrecords_light")
217
+ dataset = ImageDataset(path).load()
218
+ dataset = dataset.shuffle(
219
+ dataset_length, reshuffle_each_iteration=True, seed=10
220
+ ).map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
221
+
222
+ train_size = int(dataset_length * 0.8)
223
+ validation_size = int(dataset_length * 0.1)
224
+
225
+ train_ds = dataset.take(train_size)
226
+ validation_ds = dataset.skip(train_size).take(validation_size)
227
+ test_ds = dataset.skip(train_size + validation_size).take(validation_size)
228
+
229
+ train_ds = train_ds.batch(batch_size, drop_remainder=True).prefetch(
230
+ tf.data.AUTOTUNE
231
+ )
232
+ validation_ds = validation_ds.batch(batch_size, drop_remainder=True).prefetch(
233
+ tf.data.AUTOTUNE
234
+ )
235
+ test_ds = test_ds.batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
236
+
237
+ return train_ds, validation_ds, test_ds
238
+
239
+
240
+ def load_kny_images(dataset_path, batch_size):
241
+ dataset_length = 52014
242
+ path = os.path.join(dataset_path, "kny", "images_tfrecords")
243
+ dataset = ImageDataset(path).load()
244
+ dataset = dataset.shuffle(dataset_length, reshuffle_each_iteration=True).map(
245
+ preprocess_image, num_parallel_calls=tf.data.AUTOTUNE
246
+ )
247
+
248
+ train_size = int(dataset_length * 0.8)
249
+ validation_size = int(dataset_length * 0.1)
250
+
251
+ train_ds = dataset.take(train_size)
252
+ validation_ds = dataset.skip(train_size).take(validation_size)
253
+ test_ds = dataset.skip(train_size + validation_size).take(validation_size)
254
+
255
+ train_ds = train_ds.batch(batch_size, drop_remainder=True).prefetch(
256
+ tf.data.AUTOTUNE
257
+ )
258
+ validation_ds = validation_ds.batch(batch_size, drop_remainder=True).prefetch(
259
+ tf.data.AUTOTUNE
260
+ )
261
+ test_ds = test_ds.batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
262
+
263
+ return train_ds, validation_ds, test_ds
264
+
265
+
266
+ def load_dataset(
267
+ dataset_name: str, dataset_path: str, batch_size: int
268
+ ) -> Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]:
269
+ # if dataset_name == "moving_mnist_vae":
270
+ # return load_moving_mnist_vae(dataset_path, batch_size)
271
+ # elif dataset_name == "moving_mnist":
272
+ # return load_moving_mnist(dataset_path, batch_size)
273
+ # elif dataset_name == "mnist":
274
+ # return load_mnist(dataset_path, batch_size)
275
+ # elif dataset_name == "kny_images":
276
+ # return load_kny_images(dataset_path, batch_size)
277
+ if dataset_name == "kny_images":
278
+ return load_kny_images(dataset_path, batch_size)
279
+ if dataset_name == "kny_images_light":
280
+ return load_kny_images_light(dataset_path, batch_size)
281
+ else:
282
+ raise ValueError(f"Unknown dataset: {dataset_name}")
ganime/data/experimental.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractclassmethod, abstractmethod
2
+ import glob
3
+ import math
4
+ import os
5
+ from typing import Dict
6
+ from typing_extensions import dataclass_transform
7
+
8
+ import numpy as np
9
+ import tensorflow as tf
10
+ from tqdm.auto import tqdm
11
+
12
+
13
+ def _bytes_feature(value):
14
+ """Returns a bytes_list from a string / byte."""
15
+ if isinstance(value, type(tf.constant(0))): # if value ist tensor
16
+ value = value.numpy() # get value of tensor
17
+ return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
18
+
19
+
20
+ def _float_feature(value):
21
+ """Returns a floast_list from a float / double."""
22
+ return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
23
+
24
+
25
+ def _int64_feature(value):
26
+ """Returns an int64_list from a bool / enum / int / uint."""
27
+ return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
28
+
29
+
30
+ def serialize_array(array):
31
+ array = tf.io.serialize_tensor(array)
32
+ return array
33
+
34
+
35
+ class Dataset(ABC):
36
+ def __init__(self, dataset_path: str):
37
+ self.dataset_path = dataset_path
38
+
39
+ @classmethod
40
+ def _parse_single_element(cls, element) -> tf.train.Example:
41
+
42
+ features = tf.train.Features(feature=cls._get_features(element))
43
+
44
+ return tf.train.Example(features=features)
45
+
46
+ @abstractclassmethod
47
+ def _get_features(cls, element) -> Dict[str, tf.train.Feature]:
48
+ pass
49
+
50
+ @abstractclassmethod
51
+ def _parse_tfr_element(cls, element):
52
+ pass
53
+
54
+ @classmethod
55
+ def write_to_tfr(cls, data: np.ndarray, out_dir: str, filename: str):
56
+ if not os.path.exists(out_dir):
57
+ os.makedirs(out_dir)
58
+
59
+ # Write all elements to a single tfrecord file
60
+ single_file_name = cls.__write_to_single_tfr(data, out_dir, filename)
61
+
62
+ # The optimal size for a single tfrecord file is around 100 MB. Get the number of files that need to be created
63
+ number_splits = cls.__get_number_splits(single_file_name)
64
+
65
+ if number_splits > 1:
66
+ os.remove(single_file_name)
67
+ cls.__write_to_multiple_tfr(data, out_dir, filename, number_splits)
68
+
69
+ @classmethod
70
+ def __write_to_multiple_tfr(
71
+ cls, data: np.array, out_dir: str, filename: str, n_splits: int
72
+ ):
73
+
74
+ file_count = 0
75
+
76
+ max_files = math.ceil(data.shape[0] / n_splits)
77
+
78
+ print(f"Creating {n_splits} files with {max_files} elements each.")
79
+
80
+ for i in tqdm(range(n_splits)):
81
+ current_shard_name = os.path.join(
82
+ out_dir,
83
+ f"{filename}.tfrecords-{str(i).zfill(len(str(n_splits)))}-of-{n_splits}",
84
+ )
85
+ writer = tf.io.TFRecordWriter(current_shard_name)
86
+
87
+ current_shard_count = 0
88
+ while current_shard_count < max_files: # as long as our shard is not full
89
+ # get the index of the file that we want to parse now
90
+ index = i * max_files + current_shard_count
91
+ if index >= len(
92
+ data
93
+ ): # when we have consumed the whole data, preempt generation
94
+ break
95
+
96
+ current_element = data[index]
97
+
98
+ # create the required Example representation
99
+ out = cls._parse_single_element(element=current_element)
100
+
101
+ writer.write(out.SerializeToString())
102
+ current_shard_count += 1
103
+ file_count += 1
104
+
105
+ writer.close()
106
+ print(f"\nWrote {file_count} elements to TFRecord")
107
+ return file_count
108
+
109
+ @classmethod
110
+ def __get_number_splits(cls, filename: str):
111
+ target_size = 100 * 1024 * 1024 # 100mb
112
+
113
+ single_file_size = os.path.getsize(filename)
114
+ number_splits = math.ceil(single_file_size / target_size)
115
+ return number_splits
116
+
117
+ @classmethod
118
+ def __write_to_single_tfr(cls, data: np.array, out_dir: str, filename: str):
119
+
120
+ current_path_name = os.path.join(
121
+ out_dir,
122
+ f"{filename}.tfrecords-0-of-1",
123
+ )
124
+
125
+ writer = tf.io.TFRecordWriter(current_path_name)
126
+ for element in tqdm(data):
127
+ writer.write(cls._parse_single_element(element).SerializeToString())
128
+ writer.close()
129
+
130
+ return current_path_name
131
+
132
+ def load(self) -> tf.data.TFRecordDataset:
133
+ path = self.dataset_path
134
+ dataset = None
135
+
136
+ if os.path.isdir(path):
137
+ dataset = self._load_folder(path)
138
+ elif os.path.isfile(path):
139
+ dataset = self._load_file(path)
140
+ else:
141
+ raise ValueError(f"Path {path} is not a valid file or folder.")
142
+
143
+ dataset = dataset.map(self._parse_tfr_element)
144
+ return dataset
145
+
146
+ def _load_file(self, path) -> tf.data.TFRecordDataset:
147
+ return tf.data.TFRecordDataset(path)
148
+
149
+ def _load_folder(self, path) -> tf.data.TFRecordDataset:
150
+
151
+ return tf.data.TFRecordDataset(
152
+ glob.glob(os.path.join(path, "**/*.tfrecords*"), recursive=True)
153
+ )
154
+
155
+
156
+ class VideoDataset(Dataset):
157
+ @classmethod
158
+ def _get_features(cls, element) -> Dict[str, tf.train.Feature]:
159
+ return {
160
+ "frames": _int64_feature(element.shape[0]),
161
+ "height": _int64_feature(element.shape[1]),
162
+ "width": _int64_feature(element.shape[2]),
163
+ "depth": _int64_feature(element.shape[3]),
164
+ "raw_video": _bytes_feature(serialize_array(element)),
165
+ }
166
+
167
+ @classmethod
168
+ def _parse_tfr_element(cls, element):
169
+ # use the same structure as above; it's kinda an outline of the structure we now want to create
170
+ data = {
171
+ "frames": tf.io.FixedLenFeature([], tf.int64),
172
+ "height": tf.io.FixedLenFeature([], tf.int64),
173
+ "width": tf.io.FixedLenFeature([], tf.int64),
174
+ "raw_video": tf.io.FixedLenFeature([], tf.string),
175
+ "depth": tf.io.FixedLenFeature([], tf.int64),
176
+ }
177
+
178
+ content = tf.io.parse_single_example(element, data)
179
+
180
+ frames = content["frames"]
181
+ height = content["height"]
182
+ width = content["width"]
183
+ depth = content["depth"]
184
+ raw_video = content["raw_video"]
185
+
186
+ # get our 'feature'-- our image -- and reshape it appropriately
187
+ feature = tf.io.parse_tensor(raw_video, out_type=tf.uint8)
188
+ feature = tf.reshape(feature, shape=[frames, height, width, depth])
189
+ return feature
190
+
191
+
192
+ class ImageDataset(Dataset):
193
+ @classmethod
194
+ def _get_features(cls, element) -> Dict[str, tf.train.Feature]:
195
+ return {
196
+ "height": _int64_feature(element.shape[0]),
197
+ "width": _int64_feature(element.shape[1]),
198
+ "depth": _int64_feature(element.shape[2]),
199
+ "raw_image": _bytes_feature(serialize_array(element)),
200
+ }
201
+
202
+ @classmethod
203
+ def _parse_tfr_element(cls, element):
204
+ # use the same structure as above; it's kinda an outline of the structure we now want to create
205
+ data = {
206
+ "height": tf.io.FixedLenFeature([], tf.int64),
207
+ "width": tf.io.FixedLenFeature([], tf.int64),
208
+ "raw_image": tf.io.FixedLenFeature([], tf.string),
209
+ "depth": tf.io.FixedLenFeature([], tf.int64),
210
+ }
211
+
212
+ content = tf.io.parse_single_example(element, data)
213
+
214
+ height = content["height"]
215
+ width = content["width"]
216
+ depth = content["depth"]
217
+ raw_image = content["raw_image"]
218
+
219
+ # get our 'feature'-- our image -- and reshape it appropriately
220
+ feature = tf.io.parse_tensor(raw_image, out_type=tf.uint8)
221
+ feature = tf.reshape(feature, shape=[height, width, depth])
222
+ return feature
ganime/data/kny.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import numpy as np
4
+
5
+ from .base import SequenceDataset
6
+
7
+
8
+ class KNYImage(SequenceDataset):
9
+ def load_data(self, dataset_path: str, split: str) -> np.ndarray:
10
+ data = np.load(os.path.join(dataset_path, "kny", "kny_images_64x128.npy"))
11
+ if split == "train":
12
+ data = data[:-5000]
13
+ else:
14
+ data = data[-5000:]
15
+
16
+ return data
17
+
18
+ def preprocess_data(self, data: np.ndarray) -> np.ndarray:
19
+ return data / 255
ganime/data/mnist.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ from typing import Literal
4
+
5
+ import numpy as np
6
+
7
+ from .base import SequenceDataset
8
+ import math
9
+
10
+
11
+ class MovingMNISTImage(SequenceDataset):
12
+ def load_data(self, dataset_path: str, split: str) -> np.ndarray:
13
+ data = np.load(os.path.join(dataset_path, "moving_mnist", "mnist_test_seq.npy"))
14
+ # Data is of shape (window, n_samples, width, height)
15
+ # But we want for keras something of shape (n_samples, window, width, height)
16
+ data = np.moveaxis(data, 0, 1)
17
+ # Also expand dimensions to have channels at the end (n_samples, window, width, height, channels)
18
+ data = np.expand_dims(data, axis=-1)
19
+ if split == "train":
20
+ data = data[:-1000]
21
+ else:
22
+ data = data[-1000:]
23
+
24
+ data = np.concatenate([data, data, data], axis=-1)
25
+
26
+ return data
27
+
28
+ def __getitem__(self, idx):
29
+ inds = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
30
+ batch_x = self.data[inds, 0, ...]
31
+ batch_y = self.data[inds, 1, ...]
32
+
33
+ return batch_x, batch_y
34
+
35
+ def preprocess_data(self, data: np.ndarray) -> np.ndarray:
36
+ return data / 255
37
+
38
+
39
+ class MovingMNIST(SequenceDataset):
40
+ def __init__(
41
+ self,
42
+ dataset_path: str,
43
+ batch_size: int,
44
+ split: Literal["train", "validation", "test"] = "train",
45
+ ):
46
+ self.batch_size = batch_size
47
+ self.split = split
48
+ root_path = os.path.join(dataset_path, "moving_mnist", split)
49
+ self.paths = glob.glob(os.path.join(root_path, "*.npy"))
50
+ # self.data = self.preprocess_data(self.data)
51
+
52
+ self.indices = np.arange(len(self.paths))
53
+ self.on_epoch_end()
54
+
55
+ # def load_data(self, dataset_path: str, split: str) -> np.ndarray:
56
+ # data = np.load(os.path.join(dataset_path, "moving_mnist", "mnist_test_seq.npy"))
57
+ # # Data is of shape (window, n_samples, width, height)
58
+ # # But we want for keras something of shape (n_samples, window, width, height)
59
+ # data = np.moveaxis(data, 0, 1)
60
+ # # Also expand dimensions to have channels at the end (n_samples, window, width, height, channels)
61
+ # data = np.expand_dims(data, axis=-1)
62
+ # if split == "train":
63
+ # data = data[:100]
64
+ # else:
65
+ # data = data[100:110]
66
+
67
+ # data = np.concatenate([data, data, data], axis=-1)
68
+
69
+ # return data
70
+
71
+ def __len__(self):
72
+ return math.ceil(len(self.paths) / self.batch_size)
73
+
74
+ def __getitem__(self, idx):
75
+ inds = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
76
+ data = self.load_indices(inds)
77
+ batch_x = np.concatenate([data[:, 0:1, ...], data[:, -1:, ...]], axis=1)
78
+ batch_y = data[:, 1:, ...]
79
+
80
+ return batch_x, batch_y
81
+
82
+ def get_fixed_batch(self, idx):
83
+ self.fixed_indices = (
84
+ self.fixed_indices
85
+ if hasattr(self, "fixed_indices")
86
+ else self.indices[
87
+ idx * self.batch_size : (idx + 1) * self.batch_size
88
+ ].copy()
89
+ )
90
+ data = self.load_indices(self.fixed_indices)
91
+ batch_x = np.concatenate([data[:, 0:1, ...], data[:, -1:, ...]], axis=1)
92
+ batch_y = data[:, 1:, ...]
93
+
94
+ return batch_x, batch_y
95
+
96
+ def load_indices(self, indices):
97
+ paths_to_load = [self.paths[index] for index in indices]
98
+ data = [np.load(path) for path in paths_to_load]
99
+ data = np.array(data)
100
+ return self.preprocess_data(data)
101
+
102
+ def preprocess_data(self, data: np.ndarray) -> np.ndarray:
103
+ return data / 255
ganime/metrics/image.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from scipy import linalg
4
+ from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
5
+ from tqdm.auto import tqdm
6
+
7
+ inceptionv3 = InceptionV3(include_top=False, weights="imagenet", pooling="avg")
8
+
9
+
10
+ def resize_images(images, new_shape):
11
+ images = tf.image.resize(images, new_shape)
12
+ return images
13
+
14
+
15
+ def calculate_fid(real_embeddings, generated_embeddings):
16
+ # calculate mean and covariance statistics
17
+ mu1, sigma1 = real_embeddings.mean(axis=0), np.cov(real_embeddings, rowvar=False)
18
+ mu2, sigma2 = generated_embeddings.mean(axis=0), np.cov(
19
+ generated_embeddings, rowvar=False
20
+ )
21
+ # calculate sum squared difference between means
22
+ ssdiff = np.sum((mu1 - mu2) ** 2.0)
23
+ # calculate sqrt of product between cov
24
+ covmean = linalg.sqrtm(sigma1.dot(sigma2))
25
+ # check and correct imaginary numbers from sqrt
26
+ if np.iscomplexobj(covmean):
27
+ covmean = covmean.real
28
+ # calculate score
29
+ fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
30
+ return fid
31
+
32
+
33
+ def calculate_images_metrics(dataset, model, total_length):
34
+ fake_embeddings = []
35
+ real_embeddings = []
36
+
37
+ psnrs = []
38
+ ssims = []
39
+
40
+ for sample in tqdm(dataset, total=total_length):
41
+ generated = model(sample[0], training=False)[0]
42
+ generated, real = generated, sample[0]
43
+
44
+ real_resized = resize_images(real, (299, 299))
45
+ generated_resized = resize_images(generated, (299, 299))
46
+
47
+ real_activations = inceptionv3(real_resized, training=False)
48
+ generated_activations = inceptionv3(generated_resized, training=False)
49
+ fake_embeddings.append(generated_activations)
50
+ real_embeddings.append(real_activations)
51
+
52
+ fake_scaled = tf.cast(((generated * 0.5) + 1) * 255, tf.uint8)
53
+ real_scaled = tf.cast(((real * 0.5) + 1) * 255, tf.uint8)
54
+
55
+ psnrs.append(tf.image.psnr(fake_scaled, real_scaled, 255).numpy())
56
+ ssims.append(tf.image.ssim(fake_scaled, real_scaled, 255).numpy())
57
+
58
+ fid = calculate_fid(
59
+ tf.concat(fake_embeddings, axis=0).numpy(),
60
+ tf.concat(real_embeddings, axis=0).numpy(),
61
+ )
62
+
63
+ # kid = calculate_kid(
64
+ # tf.concat(fake_embeddings, axis=0).numpy(),
65
+ # tf.concat(real_embeddings, axis=0).numpy(),
66
+ # )
67
+
68
+ psnr = np.array(psnrs).mean()
69
+ ssim = np.array(ssims).mean()
70
+ return {"fid": fid, "ssim": ssim, "psnr": psnr}
ganime/metrics/video.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ import tensorflow_gan as tfgan
4
+ import tensorflow_hub as hub
5
+ from sklearn.metrics.pairwise import polynomial_kernel
6
+ from tqdm.auto import tqdm
7
+
8
+ i3d = hub.KerasLayer("https://tfhub.dev/deepmind/i3d-kinetics-400/1")
9
+
10
+
11
+ def resize_videos(videos, target_resolution):
12
+ """Runs some preprocessing on the videos for I3D model.
13
+ Args:
14
+ videos: <T>[batch_size, num_frames, height, width, depth] The videos to be
15
+ preprocessed. We don't care about the specific dtype of the videos, it can
16
+ be anything that tf.image.resize_bilinear accepts. Values are expected to
17
+ be in [-1, 1].
18
+ target_resolution: (width, height): target video resolution
19
+ Returns:
20
+ videos: <float32>[batch_size, num_frames, height, width, depth]
21
+ """
22
+ min_frames = 9
23
+ B, T, H, W, C = videos.shape
24
+ videos = tf.transpose(videos, (1, 0, 2, 3, 4))
25
+ if T < min_frames:
26
+ videos = tf.concat([tf.zeros((min_frames - T, B, H, W, C)), videos], axis=0)
27
+ scaled_videos = tf.map_fn(lambda x: tf.image.resize(x, target_resolution), videos)
28
+ scaled_videos = tf.transpose(scaled_videos, (1, 0, 2, 3, 4))
29
+ return scaled_videos
30
+
31
+
32
+ def polynomial_mmd(X, Y):
33
+ m = X.shape[0]
34
+ n = Y.shape[0]
35
+ # compute kernels
36
+ K_XX = polynomial_kernel(X)
37
+ K_YY = polynomial_kernel(Y)
38
+ K_XY = polynomial_kernel(X, Y)
39
+ # compute mmd distance
40
+ K_XX_sum = (K_XX.sum() - np.diagonal(K_XX).sum()) / (m * (m - 1))
41
+ K_YY_sum = (K_YY.sum() - np.diagonal(K_YY).sum()) / (n * (n - 1))
42
+ K_XY_sum = K_XY.sum() / (m * n)
43
+ mmd = K_XX_sum + K_YY_sum - 2 * K_XY_sum
44
+ return mmd
45
+
46
+
47
+ def calculate_ssim_videos(fake, real):
48
+ fake = tf.cast(((fake * 0.5) + 0.5) * 255, tf.uint8)
49
+ real = tf.cast(((real * 0.5) + 0.5) * 255, tf.uint8)
50
+ ssims = []
51
+ for i in range(fake.shape[0]):
52
+ ssims.append(tf.image.ssim(fake[i], real[i], 255).numpy().mean())
53
+
54
+ return np.array(ssims).mean()
55
+
56
+
57
+ def calculate_psnr_videos(fake, real):
58
+ fake = tf.cast(((fake * 0.5) + 0.5) * 255, tf.uint8)
59
+ real = tf.cast(((real * 0.5) + 0.5) * 255, tf.uint8)
60
+ psnrs = []
61
+ for i in range(fake.shape[0]):
62
+ psnrs.append(tf.image.psnr(fake[i], real[i], 255).numpy().mean())
63
+
64
+ return np.array(psnrs).mean()
65
+
66
+
67
+ def calculate_videos_metrics(dataset, model, total_length):
68
+ fake_embeddings = []
69
+ real_embeddings = []
70
+
71
+ psnrs = []
72
+ ssims = []
73
+
74
+ for sample in tqdm(dataset, total=total_length):
75
+ generated = model(sample, training=False)
76
+ generated, real = generated[:, 1:], sample["y"][:, 1:] # ignore first frame
77
+
78
+ real_resized = resize_videos(real, (224, 224))
79
+ generated_resized = resize_videos(generated, (224, 224))
80
+
81
+ real_activations = i3d(real_resized)
82
+ generated_activations = i3d(generated_resized)
83
+ fake_embeddings.append(generated_activations)
84
+ real_embeddings.append(real_activations)
85
+
86
+ psnrs.append(calculate_psnr_videos(generated, real))
87
+ ssims.append(calculate_ssim_videos(generated, real))
88
+
89
+ # fake_concat, real_concat = tf.concat(fake_embeddings, axis=0), tf.concat(real_embeddings, axis=0)
90
+ fvd = tfgan.eval.frechet_classifier_distance_from_activations(
91
+ tf.concat(fake_embeddings, axis=0), tf.concat(real_embeddings, axis=0)
92
+ )
93
+ kvd = polynomial_mmd(
94
+ tf.concat(fake_embeddings, axis=0), tf.concat(real_embeddings, axis=0)
95
+ )
96
+ psnr = np.array(psnrs).mean()
97
+ ssim = np.array(ssims).mean()
98
+ return {"fvd": fvd, "kvd": kvd, "ssim": ssim, "psnr": psnr}
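The kvd value above is the unbiased polynomial-kernel MMD estimate \widehat{\mathrm{MMD}}^2 = \frac{1}{m(m-1)}\sum_{i \ne j} k(x_i, x_j) + \frac{1}{n(n-1)}\sum_{i \ne j} k(y_i, y_j) - \frac{2}{mn}\sum_{i,j} k(x_i, y_j), which polynomial_mmd computes with scikit-learn's default degree-3 polynomial kernel. A quick sanity-check sketch on random features (importing this module also loads the I3D hub model, and the 400-dimensional feature size is only an assumption mirroring the Kinetics-400 logits):

    import numpy as np
    from ganime.metrics.video import polynomial_mmd

    rng = np.random.default_rng(seed=0)
    feats_real = rng.normal(size=(64, 400))        # assumed I3D embedding size
    feats_same = rng.normal(size=(64, 400))        # drawn from the same distribution
    feats_shifted = feats_real + 3.0               # clearly shifted distribution

    print(polynomial_mmd(feats_real, feats_same))     # close to zero
    print(polynomial_mmd(feats_real, feats_shifted))  # much larger than zero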
ganime/model/__init__.py ADDED
File without changes
ganime/model/base.py ADDED
@@ -0,0 +1,45 @@
1
+ import tensorflow as tf
2
+ from ganime.model.vqgan_clean.vqgan import VQGAN
3
+
4
+
5
+ def load_model(
6
+ model: str, config: dict, strategy: tf.distribute.Strategy
7
+ ) -> tf.keras.Model:
8
+
9
+ if model == "vqgan":
10
+ with strategy.scope():
11
+ print(config["model"])
12
+ model = VQGAN(**config["model"])
13
+
14
+ gen_optimizer = tf.keras.optimizers.Adam(
15
+ learning_rate=config["trainer"]["gen_lr"],
16
+ beta_1=config["trainer"]["gen_beta_1"],
17
+ beta_2=config["trainer"]["gen_beta_2"],
18
+ clipnorm=config["trainer"]["gen_clip_norm"],
19
+ )
20
+ disc_optimizer = tf.keras.optimizers.Adam(
21
+ learning_rate=config["trainer"]["disc_lr"],
22
+ beta_1=config["trainer"]["disc_beta_1"],
23
+ beta_2=config["trainer"]["disc_beta_2"],
24
+ clipnorm=config["trainer"]["disc_clip_norm"],
25
+ )
26
+ model.compile(gen_optimizer=gen_optimizer, disc_optimizer=disc_optimizer)
27
+ return model
28
+ else:
29
+ raise ValueError(f"Unknown model: {model}")
30
+
31
+ # if model == "moving_vae":
32
+ # from ganime.model.moving_vae import MovingVAE
33
+
34
+ # with strategy.scope():
35
+ # model = MovingVAE(input_shape=input_shape)
36
+
37
+ # negloglik = lambda x, rv_x: -rv_x.log_prob(x)
38
+ # model.compile(
39
+ # optimizer=tf.optimizers.Adam(learning_rate=config["lr"]),
40
+ # loss=negloglik,
41
+ # )
42
+ # # model.build(input_shape=(None, *input_shape))
43
+ # # model.summary()
44
+
45
+ # return model
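A minimal sketch of driving load_model from a YAML file. The path below is hypothetical, and the config is assumed to contain a "model" section with VQGAN keyword arguments plus a "trainer" section with the keys read above (gen_lr, gen_beta_1, gen_beta_2, gen_clip_norm and their disc_* counterparts):

    import tensorflow as tf
    import yaml

    from ganime.model.base import load_model

    with open("configs/my_experiment.yaml") as f:  # hypothetical config path
        config = yaml.safe_load(f)

    strategy = tf.distribute.MirroredStrategy()  # any tf.distribute.Strategy fits the signature
    model = load_model("vqgan", config, strategy)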
ganime/model/moving_vae.py ADDED
@@ -0,0 +1,126 @@
1
+ from tensorflow.keras import Model
2
+
3
+ import tensorflow as tf
4
+ import tensorflow_probability as tfp
5
+
6
+
7
+ class MovingVAE(Model):
8
+ def __init__(self, input_shape, encoded_size=64, base_depth=32):
9
+ super().__init__()
10
+
11
+ self.encoded_size = encoded_size
12
+ self.base_depth = base_depth
13
+
14
+ self.prior = tfp.distributions.Independent(
15
+ tfp.distributions.Normal(loc=tf.zeros(encoded_size), scale=1),
16
+ reinterpreted_batch_ndims=1,
17
+ )
18
+
19
+ self.encoder = tf.keras.Sequential(
20
+ [
21
+ tf.keras.layers.InputLayer(input_shape=input_shape),
22
+ tf.keras.layers.Lambda(lambda x: tf.cast(x, tf.float32) - 0.5),
23
+ tf.keras.layers.Conv3D(
24
+ self.base_depth,
25
+ 5,
26
+ strides=1,
27
+ padding="same",
28
+ activation=tf.nn.leaky_relu,
29
+ ),
30
+ tf.keras.layers.Conv3D(
31
+ self.base_depth,
32
+ 5,
33
+ strides=2,
34
+ padding="same",
35
+ activation=tf.nn.leaky_relu,
36
+ ),
37
+ tf.keras.layers.Conv3D(
38
+ 2 * self.base_depth,
39
+ 5,
40
+ strides=1,
41
+ padding="same",
42
+ activation=tf.nn.leaky_relu,
43
+ ),
44
+ tf.keras.layers.Conv3D(
45
+ 2 * self.base_depth,
46
+ 5,
47
+ strides=2,
48
+ padding="same",
49
+ activation=tf.nn.leaky_relu,
50
+ ),
51
+ # tf.keras.layers.Conv3D(4 * encoded_size, 7, strides=1,
52
+ # padding='valid', activation=tf.nn.leaky_relu),
53
+ tf.keras.layers.Flatten(),
54
+ tf.keras.layers.Dense(
55
+ tfp.layers.MultivariateNormalTriL.params_size(self.encoded_size),
56
+ activation=None,
57
+ ),
58
+ tfp.layers.MultivariateNormalTriL(
59
+ self.encoded_size,
60
+ activity_regularizer=tfp.layers.KLDivergenceRegularizer(self.prior),
61
+ ),
62
+ ]
63
+ )
64
+
65
+ self.decoder = tf.keras.Sequential(
66
+ [
67
+ tf.keras.layers.InputLayer(input_shape=[self.encoded_size]),
68
+ tf.keras.layers.Reshape([1, 1, 1, self.encoded_size]),
69
+ tf.keras.layers.Conv3DTranspose(
70
+ self.base_depth,
71
+ (5, 4, 4),
72
+ strides=1,
73
+ padding="valid",
74
+ activation=tf.nn.leaky_relu,
75
+ ),
76
+ tf.keras.layers.Conv3DTranspose(
77
+ 2 * self.base_depth,
78
+ (5, 4, 4),
79
+ strides=(1, 2, 2),
80
+ padding="same",
81
+ activation=tf.nn.leaky_relu,
82
+ ),
83
+ tf.keras.layers.Conv3DTranspose(
84
+ 2 * self.base_depth,
85
+ (5, 4, 4),
86
+ strides=2,
87
+ padding="same",
88
+ activation=tf.nn.leaky_relu,
89
+ ),
90
+ tf.keras.layers.Conv3DTranspose(
91
+ self.base_depth,
92
+ (5, 4, 4),
93
+ strides=(1, 2, 2),
94
+ padding="same",
95
+ activation=tf.nn.leaky_relu,
96
+ ),
97
+ tf.keras.layers.Conv3DTranspose(
98
+ self.base_depth,
99
+ (5, 4, 4),
100
+ strides=2,
101
+ padding="same",
102
+ activation=tf.nn.leaky_relu,
103
+ ),
104
+ tf.keras.layers.Conv3DTranspose(
105
+ self.base_depth,
106
+ (5, 4, 4),
107
+ strides=1,
108
+ padding="same",
109
+ activation=tf.nn.leaky_relu,
110
+ ),
111
+ tf.keras.layers.Conv2D(
112
+ filters=1, kernel_size=5, strides=1, padding="same", activation=None
113
+ ),
114
+ tf.keras.layers.Flatten(),
115
+ tfp.layers.IndependentBernoulli(
116
+ input_shape, tfp.distributions.Bernoulli.logits
117
+ ),
118
+ ]
119
+ )
120
+
121
+ self.model = tf.keras.Model(
122
+ inputs=self.encoder.inputs, outputs=self.decoder(self.encoder.outputs[0])
123
+ )
124
+
125
+ def call(self, inputs):
126
+ return self.model(inputs)
ganime/model/p2p/__init__.py ADDED
File without changes
ganime/model/p2p/p2p.py ADDED
@@ -0,0 +1,543 @@
1
+ from statistics import mode
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.python.keras import Model, Sequential
5
+ from tensorflow.python.keras.layers import Dense, LSTMCell, RNN, Conv2D, Conv2DTranspose
6
+ from tensorflow.keras.layers import BatchNormalization, TimeDistributed
7
+ from tensorflow.python.keras.layers.advanced_activations import LeakyReLU
8
+ from tensorflow.keras.layers import Activation
9
+
10
+ # from tensorflow_probability.python.layers.dense_variational import (
11
+ # DenseReparameterization,
12
+ # )
13
+ # import tensorflow_probability as tfp
14
+ from tensorflow.keras.losses import Loss
15
+
16
+
17
+ class KLCriterion(Loss):
18
+ def call(self, y_true, y_pred):
19
+ (mu1, logvar1), (mu2, logvar2) = y_true, y_pred
20
+
21
+ """KL( N(mu_1, sigma2_1) || N(mu_2, sigma2_2))"""
22
+ sigma1 = tf.exp(tf.math.multiply(logvar1, 0.5))
23
+ sigma2 = tf.exp(tf.math.multiply(logvar2, 0.5))
24
+
25
+ kld = (
26
+ tf.math.log(sigma2 / sigma1)
27
+ + (tf.exp(logvar1) + tf.square(mu1 - mu2)) / (2 * tf.exp(logvar2))
28
+ - 0.5
29
+ )
30
+ return tf.reduce_sum(kld) / 22
31
+
32
+
33
+ class Encoder(Model):
34
+ def __init__(self, dim, nc=1):
35
+ super().__init__()
36
+ self.dim = dim
37
+ self.c1 = Sequential(
38
+ [
39
+ Conv2D(64, kernel_size=4, strides=2, padding="same"),
40
+ BatchNormalization(),
41
+ LeakyReLU(alpha=0.2),
42
+ ]
43
+ )
44
+ self.c2 = Sequential(
45
+ [
46
+ Conv2D(128, kernel_size=4, strides=2, padding="same"),
47
+ BatchNormalization(),
48
+ LeakyReLU(alpha=0.2),
49
+ ]
50
+ )
51
+ self.c3 = Sequential(
52
+ [
53
+ Conv2D(256, kernel_size=4, strides=2, padding="same"),
54
+ BatchNormalization(),
55
+ LeakyReLU(alpha=0.2),
56
+ ]
57
+ )
58
+ self.c4 = Sequential(
59
+ [
60
+ Conv2D(512, kernel_size=4, strides=2, padding="same"),
61
+ BatchNormalization(),
62
+ LeakyReLU(alpha=0.2),
63
+ ]
64
+ )
65
+ self.c5 = Sequential(
66
+ [
67
+ Conv2D(self.dim, kernel_size=4, strides=1, padding="valid"),
68
+ BatchNormalization(),
69
+ Activation("tanh"),
70
+ ]
71
+ )
72
+
73
+ def call(self, input):
74
+ h1 = self.c1(input)
75
+ h2 = self.c2(h1)
76
+ h3 = self.c3(h2)
77
+ h4 = self.c4(h3)
78
+ h5 = self.c5(h4)
79
+ return tf.reshape(h5, (-1, self.dim)), [h1, h2, h3, h4, h5]
80
+
81
+
82
+ class Decoder(Model):
83
+ def __init__(self, dim, nc=1):
84
+ super().__init__()
85
+ self.dim = dim
86
+ self.upc1 = Sequential(
87
+ [
88
+ Conv2DTranspose(512, kernel_size=4, strides=1, padding="valid"),
89
+ BatchNormalization(),
90
+ LeakyReLU(alpha=0.2),
91
+ ]
92
+ )
93
+ self.upc2 = Sequential(
94
+ [
95
+ Conv2DTranspose(256, kernel_size=4, strides=2, padding="same"),
96
+ BatchNormalization(),
97
+ LeakyReLU(alpha=0.2),
98
+ ]
99
+ )
100
+ self.upc3 = Sequential(
101
+ [
102
+ Conv2DTranspose(128, kernel_size=4, strides=2, padding="same"),
103
+ BatchNormalization(),
104
+ LeakyReLU(alpha=0.2),
105
+ ]
106
+ )
107
+ self.upc4 = Sequential(
108
+ [
109
+ Conv2DTranspose(64, kernel_size=4, strides=2, padding="same"),
110
+ BatchNormalization(),
111
+ LeakyReLU(alpha=0.2),
112
+ ]
113
+ )
114
+ self.upc5 = Sequential(
115
+ [
116
+ Conv2DTranspose(1, kernel_size=4, strides=2, padding="same"),
117
+ Activation("sigmoid"),
118
+ ]
119
+ )
120
+
121
+ def call(self, input):
122
+ vec, skip = input
123
+ d1 = self.upc1(tf.reshape(vec, (-1, 1, 1, self.dim)))
124
+ d2 = self.upc2(tf.concat([d1, skip[3]], axis=-1))
125
+ d3 = self.upc3(tf.concat([d2, skip[2]], axis=-1))
126
+ d4 = self.upc4(tf.concat([d3, skip[1]], axis=-1))
127
+ output = self.upc5(tf.concat([d4, skip[0]], axis=-1))
128
+ return output
129
+
130
+
131
+ class MyLSTM(Model):
132
+ def __init__(self, input_shape, hidden_size, output_size, n_layers):
133
+ super().__init__()
134
+ self.hidden_size = hidden_size
135
+ self.n_layers = n_layers
136
+ self.embed = Dense(hidden_size, input_dim=input_shape)
137
+ # self.lstm = Sequential(
138
+ # [LSTMCell(hidden_size) for _ in range(n_layers)], name="lstm"
139
+ # )
140
+ # self.lstm = self.create_lstm(hidden_size, n_layers)
141
+ self.lstm = LSTMCell(hidden_size)
142
+ self.out = Dense(output_size)
143
+
144
+ def init_hidden(self, batch_size):
145
+ hidden = []
146
+ for i in range(self.n_layers):
147
+ hidden.append(
148
+ (
149
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
150
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
151
+ )
152
+ )
153
+ self.__dict__["hidden"] = hidden
154
+
155
+ def build(self, input_shape):
156
+ self.init_hidden(input_shape[0])
157
+
158
+ def call(self, inputs):
159
+ h_in = self.embed(inputs)
160
+ for i in range(self.n_layers):
161
+ _, self.hidden[i] = self.lstm(h_in, self.hidden[i])
162
+ h_in = self.hidden[i][0]
163
+
164
+ return self.out(h_in)
165
+
166
+
167
+ class MyGaussianLSTM(Model):
168
+ def __init__(self, input_shape, hidden_size, output_size, n_layers):
169
+ super().__init__()
170
+ self.hidden_size = hidden_size
171
+ self.n_layers = n_layers
172
+ self.embed = Dense(hidden_size, input_dim=input_shape)
173
+ # self.lstm = Sequential(
174
+ # [LSTMCell(hidden_size) for _ in range(n_layers)], name="lstm"
175
+ # )
176
+ self.lstm = LSTMCell(hidden_size)
177
+ self.mu_net = Dense(output_size)
178
+ self.logvar_net = Dense(output_size)
179
+ # self.out = Sequential(
180
+ # [
181
+ # tf.keras.layers.Dense(
182
+ # tfp.layers.MultivariateNormalTriL.params_size(output_size),
183
+ # activation=None,
184
+ # ),
185
+ # tfp.layers.MultivariateNormalTriL(output_size),
186
+ # ]
187
+ # )
188
+
189
+ def reparameterize(self, mu, logvar: tf.Tensor):
190
+ logvar = tf.math.exp(logvar * 0.5)
191
+ eps = tf.random.normal(logvar.shape)
192
+ return tf.add(tf.math.multiply(eps, logvar), mu)
193
+
194
+ def init_hidden(self, batch_size):
195
+ hidden = []
196
+ for i in range(self.n_layers):
197
+ hidden.append(
198
+ (
199
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
200
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
201
+ )
202
+ )
203
+ self.__dict__["hidden"] = hidden
204
+
205
+ def build(self, input_shape):
206
+ self.init_hidden(input_shape[0])
207
+
208
+ def call(self, inputs):
209
+ h_in = self.embed(inputs)
210
+ for i in range(self.n_layers):
211
+ # print(h_in.shape, self.hidden[i][0].shape, self.hidden[i][0].shape)
212
+
213
+ _, self.hidden[i] = self.lstm(h_in, self.hidden[i])
214
+ h_in = self.hidden[i][0]
215
+ mu = self.mu_net(h_in)
216
+ logvar = self.logvar_net(h_in)
217
+ z = self.reparameterize(mu, logvar)
218
+ return z, mu, logvar
219
+
220
+
221
+ class P2P(Model):
222
+ def __init__(
223
+ self,
224
+ channels: int = 1,
225
+ g_dim: int = 128,
226
+ z_dim: int = 10,
227
+ rnn_size: int = 256,
228
+ prior_rnn_layers: int = 1,
229
+ posterior_rnn_layers: int = 1,
230
+ predictor_rnn_layers: float = 1,
231
+ skip_prob: float = 0.5,
232
+ n_past: int = 1,
233
+ last_frame_skip: bool = False,
234
+ beta: float = 0.0001,
235
+ weight_align: float = 0.1,
236
+ weight_cpc: float = 100,
237
+ ):
238
+ super().__init__()
239
+ self.channels = channels
240
+ self.g_dim = g_dim
241
+ self.z_dim = z_dim
242
+ self.rnn_size = rnn_size
243
+ self.prior_rnn_layers = prior_rnn_layers
244
+ self.posterior_rnn_layers = posterior_rnn_layers
245
+ self.predictor_rnn_layers = predictor_rnn_layers
246
+
247
+ self.skip_prob = skip_prob
248
+ self.n_past = n_past
249
+ self.last_frame_skip = last_frame_skip
250
+ self.beta = beta
251
+ self.weight_align = weight_align
252
+ self.weight_cpc = weight_cpc
253
+
254
+ self.frame_predictor = MyLSTM(
255
+ self.g_dim + self.z_dim + 1 + 1,
256
+ self.rnn_size,
257
+ self.g_dim,
258
+ self.predictor_rnn_layers,
259
+ )
260
+
261
+ self.prior = MyGaussianLSTM(
262
+ self.g_dim + self.g_dim + 1 + 1,
263
+ self.rnn_size,
264
+ self.z_dim,
265
+ self.prior_rnn_layers,
266
+ )
267
+
268
+ self.posterior = MyGaussianLSTM(
269
+ self.g_dim + self.g_dim + 1 + 1,
270
+ self.rnn_size,
271
+ self.z_dim,
272
+ self.posterior_rnn_layers,
273
+ )
274
+
275
+ self.encoder = Encoder(self.g_dim, self.channels)
276
+ self.decoder = Decoder(self.g_dim, self.channels)
277
+
278
+ # criterions
279
+ self.mse_criterion = tf.keras.losses.MeanSquaredError()
280
+ self.kl_criterion = KLCriterion()
281
+ self.align_criterion = tf.keras.losses.MeanSquaredError()
282
+
283
+ # optimizers
284
+ self.frame_predictor_optimizer = tf.keras.optimizers.Adam(
285
+ learning_rate=0.0001 # , beta_1=0.9, beta_2=0.999, epsilon=1e-8
286
+ )
287
+ self.posterior_optimizer = tf.keras.optimizers.Adam(
288
+ learning_rate=0.0001 # , beta_1=0.9, beta_2=0.999, epsilon=1e-8
289
+ )
290
+ self.prior_optimizer = tf.keras.optimizers.Adam(
291
+ learning_rate=0.0001 # , beta_1=0.9, beta_2=0.999, epsilon=1e-8
292
+ )
293
+ self.encoder_optimizer = tf.keras.optimizers.Adam(
294
+ learning_rate=0.0001 # , beta_1=0.9, beta_2=0.999, epsilon=1e-8
295
+ )
296
+ self.decoder_optimizer = tf.keras.optimizers.Adam(
297
+ learning_rate=0.0001 # , beta_1=0.9, beta_2=0.999, epsilon=1e-8
298
+ )
299
+
300
+ def get_global_descriptor(self, x, start_ix=0, cp_ix=None):
301
+ """Get the global descriptor based on x, start_ix, cp_ix."""
302
+ if cp_ix is None:
303
+ cp_ix = x.shape[1] - 1
304
+
305
+ x_cp = x[:, cp_ix, ...]
306
+ h_cp = self.encoder(x_cp)[0] # 1 is input for skip-connection
307
+
308
+ return x_cp, h_cp
309
+
310
+ def call(self, x, start_ix=0, cp_ix=-1):
311
+ batch_size = x.shape[0]
312
+
313
+ with tf.GradientTape(persistent=True) as tape:
314
+ mse_loss = 0
315
+ kld_loss = 0
316
+ cpc_loss = 0
317
+ align_loss = 0
318
+
319
+ seq_len = x.shape[1]
320
+ start_ix = 0
321
+ cp_ix = seq_len - 1
322
+ x_cp, global_z = self.get_global_descriptor(
323
+ x, start_ix, cp_ix
324
+ ) # here global_z is h_cp
325
+
326
+ skip_prob = self.skip_prob
327
+
328
+ prev_i = 0
329
+ max_skip_count = seq_len * skip_prob
330
+ skip_count = 0
331
+ probs = np.random.uniform(low=0, high=1, size=seq_len - 1)
332
+
333
+ for i in range(1, seq_len):
334
+ if (
335
+ probs[i - 1] <= skip_prob
336
+ and i >= self.n_past
337
+ and skip_count < max_skip_count
338
+ and i != 1
339
+ and i != cp_ix
340
+ ):
341
+ skip_count += 1
342
+ continue
343
+
344
+ time_until_cp = tf.fill([batch_size, 1], (cp_ix - i + 1) / cp_ix)
345
+ delta_time = tf.fill([batch_size, 1], ((i - prev_i) / cp_ix))
346
+ prev_i = i
347
+
348
+ h = self.encoder(x[:, i - 1, ...])
349
+ h_target = self.encoder(x[:, i, ...])[0]
350
+
351
+ if self.last_frame_skip or i <= self.n_past:
352
+ h, skip = h
353
+ else:
354
+ h = h[0]
355
+
356
+ # Control Point Aware
357
+ h_cpaw = tf.concat([h, global_z, time_until_cp, delta_time], axis=1)
358
+ h_target_cpaw = tf.concat(
359
+ [h_target, global_z, time_until_cp, delta_time], axis=1
360
+ )
361
+ zt, mu, logvar = self.posterior(h_target_cpaw)
362
+ zt_p, mu_p, logvar_p = self.prior(h_cpaw)
363
+
364
+ concat = tf.concat([h, zt, time_until_cp, delta_time], axis=1)
365
+ h_pred = self.frame_predictor(concat)
366
+ x_pred = self.decoder([h_pred, skip])
367
+
368
+ if i == cp_ix: # the gen-cp-frame should be exactly as x_cp
369
+ h_pred_p = self.frame_predictor(
370
+ tf.concat([h, zt_p, time_until_cp, delta_time], axis=1)
371
+ )
372
+ x_pred_p = self.decoder([h_pred_p, skip])
373
+ cpc_loss = self.mse_criterion(x_pred_p, x_cp)
374
+
375
+ if i > 1:
376
+ align_loss += self.align_criterion(h[0], h_pred)
377
+
378
+ mse_loss += self.mse_criterion(x_pred, x[:, i, ...])
379
+ kld_loss += self.kl_criterion((mu, logvar), (mu_p, logvar_p))
380
+
381
+ # backward
382
+ loss = mse_loss + kld_loss * self.beta + align_loss * self.weight_align
383
+
384
+ prior_loss = kld_loss + cpc_loss * self.weight_cpc
385
+
386
+ var_list_frame_predictor = self.frame_predictor.trainable_variables
387
+ var_list_posterior = self.posterior.trainable_variables
388
+ var_list_prior = self.prior.trainable_variables
389
+ var_list_encoder = self.encoder.trainable_variables
390
+ var_list_decoder = self.decoder.trainable_variables
391
+
392
+ # mse: frame_predictor + decoder
393
+ # align: frame_predictor + encoder
394
+ # kld: posterior + prior + encoder
395
+
396
+ var_list_without_prior = (
397
+ var_list_frame_predictor
398
+ + var_list_posterior
399
+ + var_list_encoder
400
+ + var_list_decoder
401
+ )
402
+
403
+ gradients_without_prior = tape.gradient(
404
+ loss,
405
+ var_list_without_prior,
406
+ )
407
+ gradients_prior = tape.gradient(
408
+ prior_loss,
409
+ var_list_prior,
410
+ )
411
+
412
+ self.update_model_without_prior(
413
+ gradients_without_prior,
414
+ var_list_without_prior,
415
+ )
416
+ self.update_prior(gradients_prior, var_list_prior)
417
+ del tape
418
+
419
+ return (
420
+ mse_loss / seq_len,
421
+ kld_loss / seq_len,
422
+ cpc_loss / seq_len,
423
+ align_loss / seq_len,
424
+ )
425
+
426
+ def p2p_generate(
427
+ self,
428
+ x,
429
+ len_output,
430
+ eval_cp_ix,
431
+ start_ix=0,
432
+ cp_ix=-1,
433
+ model_mode="full",
434
+ skip_frame=False,
435
+ init_hidden=True,
436
+ ):
437
+ batch_size, num_frames, h, w, channels = x.shape
438
+ dim_shape = (h, w, channels)
439
+
440
+ gen_seq = [x[:, 0, ...]]
441
+ x_in = x[:, 0, ...]
442
+
443
+ seq_len = x.shape[1]
444
+ cp_ix = seq_len - 1
445
+
446
+ x_cp, global_z = self.get_global_descriptor(
447
+ x, cp_ix=cp_ix
448
+ ) # here global_z is h_cp
449
+
450
+ skip_prob = self.skip_prob
451
+
452
+ prev_i = 0
453
+ max_skip_count = seq_len * skip_prob
454
+ skip_count = 0
455
+ probs = np.random.uniform(0, 1, len_output - 1)
456
+
457
+ for i in range(1, len_output):
458
+ if (
459
+ probs[i - 1] <= skip_prob
460
+ and i >= self.n_past
461
+ and skip_count < max_skip_count
462
+ and i != 1
463
+ and i != (len_output - 1)
464
+ and skip_frame
465
+ ):
466
+ skip_count += 1
467
+ gen_seq.append(tf.zeros_like(x_in))
468
+ continue
469
+
470
+ time_until_cp = tf.fill([batch_size, 1], (eval_cp_ix - i + 1) / eval_cp_ix)
471
+
472
+ delta_time = tf.fill([batch_size, 1], ((i - prev_i) / eval_cp_ix))
473
+
474
+ prev_i = i
475
+
476
+ h = self.encoder(x_in)
477
+
478
+ if self.last_frame_skip or i == 1 or i < self.n_past:
479
+ h, skip = h
480
+ else:
481
+ h, _ = h
482
+
483
+ h_cpaw = tf.concat([h, global_z, time_until_cp, delta_time], axis=1)
484
+
485
+ if i < self.n_past:
486
+ h_target = self.encoder(x[:, i, ...])[0]
487
+ h_target_cpaw = tf.concat(
488
+ [h_target, global_z, time_until_cp, delta_time], axis=1
489
+ )
490
+
491
+ zt, _, _ = self.posterior(h_target_cpaw)
492
+ zt_p, _, _ = self.prior(h_cpaw)
493
+
494
+ if model_mode == "posterior" or model_mode == "full":
495
+ self.frame_predictor(
496
+ tf.concat([h, zt, time_until_cp, delta_time], axis=1)
497
+ )
498
+ elif model_mode == "prior":
499
+ self.frame_predictor(
500
+ tf.concat([h, zt_p, time_until_cp, delta_time], axis=1)
501
+ )
502
+
503
+ x_in = x[:, i, ...]
504
+ gen_seq.append(x_in)
505
+ else:
506
+ if i < num_frames:
507
+ h_target = self.encoder(x[:, i, ...])[0]
508
+ h_target_cpaw = tf.concat(
509
+ [h_target, global_z, time_until_cp, delta_time], axis=1
510
+ )
511
+ else:
512
+ h_target_cpaw = h_cpaw
513
+
514
+ zt, _, _ = self.posterior(h_target_cpaw)
515
+ zt_p, _, _ = self.prior(h_cpaw)
516
+
517
+ if model_mode == "posterior":
518
+ h = self.frame_predictor(
519
+ tf.concat([h, zt, time_until_cp, delta_time], axis=1)
520
+ )
521
+ elif model_mode == "prior" or model_mode == "full":
522
+ h = self.frame_predictor(
523
+ tf.concat([h, zt_p, time_until_cp, delta_time], axis=1)
524
+ )
525
+
526
+ x_in = self.decoder([h, skip])
527
+ gen_seq.append(x_in)
528
+ return tf.stack(gen_seq, axis=1)
529
+
530
+ def update_model_without_prior(self, gradients, var_list):
531
+ self.frame_predictor_optimizer.apply_gradients(zip(gradients, var_list))
532
+ self.posterior_optimizer.apply_gradients(zip(gradients, var_list))
533
+ self.encoder_optimizer.apply_gradients(zip(gradients, var_list))
534
+ self.decoder_optimizer.apply_gradients(zip(gradients, var_list))
535
+
536
+ def update_prior(self, gradients, var_list):
537
+ self.prior_optimizer.apply_gradients(zip(gradients, var_list))
538
+
539
+ # def update_model_without_prior(self):
540
+ # self.frame_predictor_optimizer.step()
541
+ # self.posterior_optimizer.step()
542
+ # self.encoder_optimizer.step()
543
+ # self.decoder_optimizer.step()
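For reference, KLCriterion above evaluates, element-wise with logvar_k standing for \log\sigma_k^2,

    \mathrm{KL}\!\left(\mathcal{N}(\mu_1,\sigma_1^2)\,\middle\|\,\mathcal{N}(\mu_2,\sigma_2^2)\right) = \log\frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1 - \mu_2)^2}{2\sigma_2^2} - \frac{1}{2},

summed over all elements and divided by a hard-coded constant that stands in for the batch size (22 in this file, 100 in p2p_test.py below).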
ganime/model/p2p/p2p_test.py ADDED
@@ -0,0 +1,713 @@
1
+ from tqdm.auto import tqdm
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras import Model, Sequential
5
+ from tensorflow.keras.layers import (
6
+ LSTM,
7
+ LSTMCell,
8
+ Activation,
9
+ BatchNormalization,
10
+ Conv2D,
11
+ Conv2DTranspose,
12
+ Conv3D,
13
+ Conv3DTranspose,
14
+ Dense,
15
+ Flatten,
16
+ Input,
17
+ Layer,
18
+ LeakyReLU,
19
+ MaxPooling2D,
20
+ Reshape,
21
+ TimeDistributed,
22
+ UpSampling2D,
23
+ )
24
+ from tensorflow.keras.losses import Loss
25
+ from tensorflow.keras.losses import KLDivergence, MeanSquaredError
26
+
27
+ # from tensorflow_probability.python.layers.dense_variational import (
28
+ # DenseReparameterization,
29
+ # )
30
+ # import tensorflow_probability as tfp
31
+ from tensorflow.keras.losses import Loss
32
+
33
+ initializer_conv_dense = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.02)
34
+ initializer_batch_norm = tf.keras.initializers.RandomNormal(mean=1.0, stddev=0.02)
35
+
36
+
37
+ class KLCriterion(Loss):
38
+ def call(self, y_true, y_pred):
39
+ (mu1, logvar1), (mu2, logvar2) = y_true, y_pred
40
+
41
+ """KL( N(mu_1, sigma2_1) || N(mu_2, sigma2_2))"""
42
+ sigma1 = tf.exp(tf.math.multiply(logvar1, 0.5))
43
+ sigma2 = tf.exp(tf.math.multiply(logvar2, 0.5))
44
+
45
+ kld = (
46
+ tf.math.log(sigma2 / sigma1)
47
+ + (tf.exp(logvar1) + tf.square(mu1 - mu2)) / (2 * tf.exp(logvar2))
48
+ - 0.5
49
+ )
50
+ return tf.reduce_sum(kld) / 100
51
+
52
+
53
+ class Encoder(Model):
54
+ def __init__(self, dim, nc=1):
55
+ super().__init__()
56
+ self.dim = dim
57
+ self.c1 = Sequential(
58
+ [
59
+ Conv2D(
60
+ 64,
61
+ kernel_size=4,
62
+ strides=2,
63
+ padding="same",
64
+ kernel_initializer=initializer_conv_dense,
65
+ ),
66
+ # BatchNormalization(),
67
+ LeakyReLU(alpha=0.2),
68
+ ]
69
+ )
70
+ self.c2 = Sequential(
71
+ [
72
+ Conv2D(
73
+ 128,
74
+ kernel_size=4,
75
+ strides=2,
76
+ padding="same",
77
+ kernel_initializer=initializer_conv_dense,
78
+ ),
79
+ # BatchNormalization(),
80
+ LeakyReLU(alpha=0.2),
81
+ ]
82
+ )
83
+ self.c3 = Sequential(
84
+ [
85
+ Conv2D(
86
+ 256,
87
+ kernel_size=4,
88
+ strides=2,
89
+ padding="same",
90
+ kernel_initializer=initializer_conv_dense,
91
+ ),
92
+ # BatchNormalization(),
93
+ LeakyReLU(alpha=0.2),
94
+ ]
95
+ )
96
+ self.c4 = Sequential(
97
+ [
98
+ Conv2D(
99
+ 512,
100
+ kernel_size=4,
101
+ strides=2,
102
+ padding="same",
103
+ kernel_initializer=initializer_conv_dense,
104
+ ),
105
+ # BatchNormalization(),
106
+ LeakyReLU(alpha=0.2),
107
+ ]
108
+ )
109
+ self.c5 = Sequential(
110
+ [
111
+ Conv2D(
112
+ self.dim,
113
+ kernel_size=4,
114
+ strides=1,
115
+ padding="valid",
116
+ kernel_initializer=initializer_conv_dense,
117
+ ),
118
+ # BatchNormalization(),
119
+ Activation("tanh"),
120
+ ]
121
+ )
122
+
123
+ def call(self, input):
124
+ h1 = self.c1(input)
125
+ h2 = self.c2(h1)
126
+ h3 = self.c3(h2)
127
+ h4 = self.c4(h3)
128
+ h5 = self.c5(h4)
129
+ return tf.reshape(h5, (-1, self.dim)), [h1, h2, h3, h4, h5]
130
+
131
+
132
+ class Decoder(Model):
133
+ def __init__(self, dim, nc=1):
134
+ super().__init__()
135
+ self.dim = dim
136
+ self.upc1 = Sequential(
137
+ [
138
+ Conv2DTranspose(
139
+ 512,
140
+ kernel_size=4,
141
+ strides=1,
142
+ padding="valid",
143
+ kernel_initializer=initializer_conv_dense,
144
+ ),
145
+ # BatchNormalization(),
146
+ LeakyReLU(alpha=0.2),
147
+ ]
148
+ )
149
+ self.upc2 = Sequential(
150
+ [
151
+ Conv2DTranspose(
152
+ 256,
153
+ kernel_size=4,
154
+ strides=2,
155
+ padding="same",
156
+ kernel_initializer=initializer_conv_dense,
157
+ ),
158
+ # BatchNormalization(),
159
+ LeakyReLU(alpha=0.2),
160
+ ]
161
+ )
162
+ self.upc3 = Sequential(
163
+ [
164
+ Conv2DTranspose(
165
+ 128,
166
+ kernel_size=4,
167
+ strides=2,
168
+ padding="same",
169
+ kernel_initializer=initializer_conv_dense,
170
+ ),
171
+ # BatchNormalization(),
172
+ LeakyReLU(alpha=0.2),
173
+ ]
174
+ )
175
+ self.upc4 = Sequential(
176
+ [
177
+ Conv2DTranspose(
178
+ 64,
179
+ kernel_size=4,
180
+ strides=2,
181
+ padding="same",
182
+ kernel_initializer=initializer_conv_dense,
183
+ ),
184
+ # BatchNormalization(),
185
+ LeakyReLU(alpha=0.2),
186
+ ]
187
+ )
188
+ self.upc5 = Sequential(
189
+ [
190
+ Conv2DTranspose(
191
+ 1,
192
+ kernel_size=4,
193
+ strides=2,
194
+ padding="same",
195
+ kernel_initializer=initializer_conv_dense,
196
+ ),
197
+ Activation("sigmoid"),
198
+ ]
199
+ )
200
+
201
+ def call(self, input):
202
+ vec, skip = input
203
+ d1 = self.upc1(tf.reshape(vec, (-1, 1, 1, self.dim)))
204
+ d2 = self.upc2(tf.concat([d1, skip[3]], axis=-1))
205
+ d3 = self.upc3(tf.concat([d2, skip[2]], axis=-1))
206
+ d4 = self.upc4(tf.concat([d3, skip[1]], axis=-1))
207
+ output = self.upc5(tf.concat([d4, skip[0]], axis=-1))
208
+ return output
209
+
210
+
211
+ class MyLSTM(Model):
212
+ def __init__(self, input_shape, hidden_size, output_size, n_layers):
213
+ super().__init__()
214
+ self.hidden_size = hidden_size
215
+ self.n_layers = n_layers
216
+ self.embed = Dense(
217
+ hidden_size,
218
+ input_dim=input_shape,
219
+ kernel_initializer=initializer_conv_dense,
220
+ )
221
+ # self.lstm = Sequential(
222
+ # [LSTMCell(hidden_size) for _ in range(n_layers)], name="lstm"
223
+ # )
224
+ # self.lstm = self.create_lstm(hidden_size, n_layers)
225
+ self.lstm = [
226
+ LSTMCell(
227
+ hidden_size # , return_sequences=False if i == self.n_layers - 1 else True
228
+ )
229
+ for i in range(self.n_layers)
230
+ ] # LSTMCell(hidden_size)
231
+ self.lstm_rnn = tf.keras.layers.RNN(self.lstm[0], return_state=True)
232
+ self.out = Dense(output_size, kernel_initializer=initializer_conv_dense)
233
+
234
+ def init_hidden(self, batch_size):
235
+ hidden = []
236
+ for i in range(self.n_layers):
237
+ hidden.append(
238
+ (
239
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
240
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
241
+ )
242
+ )
243
+ self.__dict__["hidden"] = hidden
244
+
245
+ def build(self, input_shape):
246
+ self.init_hidden(input_shape[0])
247
+
248
+ def call(self, inputs):
249
+ h_in = self.embed(inputs)
250
+ h_in = tf.reshape(h_in, (-1, 1, self.hidden_size))
251
+ h_in, *state = self.lstm_rnn(h_in)
252
+ for i in range(self.n_layers):
253
+ h_in, state = self.lstm[i](h_in, state)
254
+ return self.out(h_in)
255
+
256
+
257
+ class MyGaussianLSTM(Model):
258
+ def __init__(self, input_shape, hidden_size, output_size, n_layers):
259
+ super().__init__()
260
+ self.hidden_size = hidden_size
261
+ self.n_layers = n_layers
262
+ self.embed = Dense(
263
+ hidden_size,
264
+ input_dim=input_shape,
265
+ kernel_initializer=initializer_conv_dense,
266
+ )
267
+ # self.lstm = Sequential(
268
+ # [LSTMCell(hidden_size) for _ in range(n_layers)], name="lstm"
269
+ # )
270
+ self.lstm = [
271
+ LSTMCell(
272
+ hidden_size # , return_sequences=False if i == self.n_layers - 1 else True
273
+ )
274
+ for i in range(self.n_layers)
275
+ ] # LSTMCell(hidden_size)
276
+ self.lstm_rnn = tf.keras.layers.RNN(self.lstm[0], return_state=True)
277
+ self.mu_net = Dense(output_size, kernel_initializer=initializer_conv_dense)
278
+ self.logvar_net = Dense(output_size, kernel_initializer=initializer_conv_dense)
279
+ # self.out = Sequential(
280
+ # [
281
+ # tf.keras.layers.Dense(
282
+ # tfp.layers.MultivariateNormalTriL.params_size(output_size),
283
+ # activation=None,
284
+ # ),
285
+ # tfp.layers.MultivariateNormalTriL(output_size),
286
+ # ]
287
+ # )
288
+
289
+ def reparameterize(self, mu, logvar: tf.Tensor):
290
+ logvar = tf.math.exp(logvar * 0.5)
291
+ eps = tf.random.normal(logvar.shape)
292
+ return tf.add(tf.math.multiply(eps, logvar), mu)
293
+
294
+ def init_hidden(self, batch_size):
295
+ hidden = []
296
+ for i in range(self.n_layers):
297
+ hidden.append(
298
+ (
299
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
300
+ tf.Variable(tf.zeros([batch_size, self.hidden_size])),
301
+ )
302
+ )
303
+ self.__dict__["hidden"] = hidden
304
+
305
+ def build(self, input_shape):
306
+ self.init_hidden(input_shape[0])
307
+
308
+ def call(self, inputs):
309
+ h_in = self.embed(inputs)
310
+ # for i in range(self.n_layers):
311
+ # # print(h_in.shape, self.hidden[i][0].shape, self.hidden[i][0].shape)
312
+
313
+ # _, self.hidden[i] = self.lstm(h_in, self.hidden[i])
314
+ # h_in = self.hidden[i][0]
315
+ h_in = tf.reshape(h_in, (-1, 1, self.hidden_size))
316
+ h_in, *state = self.lstm_rnn(h_in)
317
+ for i in range(self.n_layers):
318
+ h_in, state = self.lstm[i](h_in, state)
319
+
320
+ mu = self.mu_net(h_in)
321
+ logvar = self.logvar_net(h_in)
322
+ z = self.reparameterize(mu, logvar)
323
+ return z, mu, logvar
324
+
325
+
326
+ class P2P(Model):
327
+ def __init__(
328
+ self,
329
+ channels: int = 1,
330
+ g_dim: int = 128,
331
+ z_dim: int = 10,
332
+ rnn_size: int = 256,
333
+ prior_rnn_layers: int = 1,
334
+ posterior_rnn_layers: int = 1,
335
+ predictor_rnn_layers: float = 2,
336
+ skip_prob: float = 0.5,
337
+ n_past: int = 1,
338
+ last_frame_skip: bool = False,
339
+ beta: float = 0.0001,
340
+ weight_align: float = 0.1,
341
+ weight_cpc: float = 100,
342
+ ):
343
+ super().__init__()
344
+ self.channels = channels
345
+ self.g_dim = g_dim
346
+ self.z_dim = z_dim
347
+ self.rnn_size = rnn_size
348
+ self.prior_rnn_layers = prior_rnn_layers
349
+ self.posterior_rnn_layers = posterior_rnn_layers
350
+ self.predictor_rnn_layers = predictor_rnn_layers
351
+
352
+ self.skip_prob = skip_prob
353
+ self.n_past = n_past
354
+ self.last_frame_skip = last_frame_skip
355
+ self.beta = beta
356
+ self.weight_align = weight_align
357
+ self.weight_cpc = weight_cpc
358
+
359
+ self.frame_predictor = MyLSTM(
360
+ self.g_dim + self.z_dim + 1 + 1,
361
+ self.rnn_size,
362
+ self.g_dim,
363
+ self.predictor_rnn_layers,
364
+ )
365
+
366
+ self.prior = MyGaussianLSTM(
367
+ self.g_dim + self.g_dim + 1 + 1,
368
+ self.rnn_size,
369
+ self.z_dim,
370
+ self.prior_rnn_layers,
371
+ )
372
+
373
+ self.posterior = MyGaussianLSTM(
374
+ self.g_dim + self.g_dim + 1 + 1,
375
+ self.rnn_size,
376
+ self.z_dim,
377
+ self.posterior_rnn_layers,
378
+ )
379
+
380
+ self.encoder = Encoder(self.g_dim, self.channels)
381
+ self.decoder = Decoder(self.g_dim, self.channels)
382
+
383
+ # criterions
384
+ self.mse_criterion = tf.keras.losses.MeanSquaredError()
385
+ self.kl_criterion = KLCriterion()
386
+ self.align_criterion = tf.keras.losses.MeanSquaredError()
387
+
388
+ self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss")
389
+ self.reconstruction_loss_tracker = tf.keras.metrics.Mean(
390
+ name="reconstruction_loss"
391
+ )
392
+ self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")
393
+ self.align_loss_tracker = tf.keras.metrics.Mean(name="align_loss")
394
+ self.cpc_loss_tracker = tf.keras.metrics.Mean(name="cpc_loss")
395
+
396
+ # optimizers
397
+ # self.frame_predictor_optimizer = tf.keras.optimizers.Adam(
398
+ # learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
399
+ # )
400
+ # self.posterior_optimizer = tf.keras.optimizers.Adam(
401
+ # learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
402
+ # )
403
+ # self.prior_optimizer = tf.keras.optimizers.Adam(
404
+ # learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
405
+ # )
406
+ # self.encoder_optimizer = tf.keras.optimizers.Adam(
407
+ # learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
408
+ # )
409
+ # self.decoder_optimizer = tf.keras.optimizers.Adam(
410
+ # learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
411
+ # )
412
+
413
+ @property
414
+ def metrics(self):
415
+ return [
416
+ self.total_loss_tracker,
417
+ self.reconstruction_loss_tracker,
418
+ self.kl_loss_tracker,
419
+ self.align_loss_tracker,
420
+ self.cpc_loss_tracker,
421
+ ]
422
+
423
+ def get_global_descriptor(self, x, start_ix=0, cp_ix=None):
424
+ """Get the global descriptor based on x, start_ix, cp_ix."""
425
+ if cp_ix is None:
426
+ cp_ix = x.shape[1] - 1
427
+
428
+ x_cp = x[:, cp_ix, ...]
429
+ h_cp = self.encoder(x_cp)[0] # 1 is input for skip-connection
430
+
431
+ return x_cp, h_cp
432
+
433
+ def compile(
434
+ self,
435
+ frame_predictor_optimizer,
436
+ prior_optimizer,
437
+ posterior_optimizer,
438
+ encoder_optimizer,
439
+ decoder_optimizer,
440
+ ):
441
+ super().compile()
442
+ self.frame_predictor_optimizer = frame_predictor_optimizer
443
+ self.prior_optimizer = prior_optimizer
444
+ self.posterior_optimizer = posterior_optimizer
445
+ self.encoder_optimizer = encoder_optimizer
446
+ self.decoder_optimizer = decoder_optimizer
447
+
448
+ def train_step(self, data):
449
+ y, x = data
450
+ batch_size = 100
451
+
452
+ mse_loss = 0
453
+ kld_loss = 0
454
+ cpc_loss = 0
455
+ align_loss = 0
456
+
457
+ seq_len = x.shape[1]
458
+ start_ix = 0
459
+ cp_ix = seq_len - 1
460
+ x_cp, global_z = self.get_global_descriptor(
461
+ x, start_ix, cp_ix
462
+ ) # here global_z is h_cp
463
+
464
+ skip_prob = self.skip_prob
465
+
466
+ prev_i = 0
467
+ max_skip_count = seq_len * skip_prob
468
+ skip_count = 0
469
+ probs = np.random.uniform(low=0, high=1, size=seq_len - 1)
470
+
471
+ with tf.GradientTape(persistent=True) as tape:
472
+ for i in tqdm(range(1, seq_len)):
473
+ if (
474
+ probs[i - 1] <= skip_prob
475
+ and i >= self.n_past
476
+ and skip_count < max_skip_count
477
+ and i != 1
478
+ and i != cp_ix
479
+ ):
480
+ skip_count += 1
481
+ continue
482
+
483
+ if i > 1:
484
+ align_loss += self.align_criterion(h, h_pred)
485
+
486
+ time_until_cp = tf.fill(
487
+ [batch_size, 1],
488
+ (cp_ix - i + 1) / cp_ix,
489
+ )
490
+ delta_time = tf.fill([batch_size, 1], ((i - prev_i) / cp_ix))
491
+ prev_i = i
492
+
493
+ h = self.encoder(x[:, i - 1, ...])
494
+ h_target = self.encoder(x[:, i, ...])[0]
495
+
496
+ if self.last_frame_skip or i <= self.n_past:
497
+ h, skip = h
498
+ else:
499
+ h = h[0]
500
+
501
+ # Control Point Aware
502
+ h_cpaw = tf.concat([h, global_z, time_until_cp, delta_time], axis=-1)
503
+ h_target_cpaw = tf.concat(
504
+ [h_target, global_z, time_until_cp, delta_time], axis=-1
505
+ )
506
+
507
+ zt, mu, logvar = self.posterior(h_target_cpaw)
508
+ zt_p, mu_p, logvar_p = self.prior(h_cpaw)
509
+
510
+ frame_predictor_input = tf.concat(
511
+ [h, zt, time_until_cp, delta_time], axis=-1
512
+ )
513
+ h_pred = self.frame_predictor(frame_predictor_input)
514
+ x_pred = self.decoder([h_pred, skip])
515
+
516
+ if i == cp_ix: # the gen-cp-frame should be exactly as x_cp
517
+ h_pred_p = self.frame_predictor(
518
+ tf.concat([h, zt_p, time_until_cp, delta_time], axis=-1)
519
+ )
520
+ x_pred_p = self.decoder([h_pred_p, skip])
521
+ cpc_loss = self.mse_criterion(x_pred_p, x_cp)
522
+
523
+ mse_loss += self.mse_criterion(x_pred, x[:, i, ...])
524
+ kld_loss += self.kl_criterion((mu, logvar), (mu_p, logvar_p))
525
+
526
+ # backward
527
+ loss = (
528
+ mse_loss
529
+ + kld_loss * self.beta
530
+ + align_loss * self.weight_align
531
+ # + cpc_loss * self.weight_cpc
532
+ )
533
+
534
+ prior_loss = kld_loss + cpc_loss * self.weight_cpc
535
+
536
+ var_list_frame_predictor = self.frame_predictor.trainable_variables
537
+ var_list_posterior = self.posterior.trainable_variables
538
+ var_list_prior = self.prior.trainable_variables
539
+ var_list_encoder = self.encoder.trainable_variables
540
+ var_list_decoder = self.decoder.trainable_variables
541
+
542
+ # mse: frame_predictor + decoder
543
+ # align: frame_predictor + encoder
544
+ # kld: posterior + prior + encoder
545
+
546
+ var_list = (
547
+ var_list_frame_predictor
548
+ + var_list_posterior
549
+ + var_list_encoder
550
+ + var_list_decoder
551
+ + var_list_prior
552
+ )
553
+
554
+ gradients = tape.gradient(
555
+ loss,
556
+ var_list,
557
+ )
558
+ gradients_prior = tape.gradient(
559
+ prior_loss,
560
+ var_list_prior,
561
+ )
562
+
563
+ self.update_model(
564
+ gradients,
565
+ var_list,
566
+ )
567
+ self.update_prior(gradients_prior, var_list_prior)
568
+ del tape
569
+
570
+ self.total_loss_tracker.update_state(loss)
571
+ self.kl_loss_tracker.update_state(kld_loss)
572
+ self.align_loss_tracker.update_state(align_loss)
573
+ self.reconstruction_loss_tracker.update_state(mse_loss)
574
+ self.cpc_loss_tracker.update_state(cpc_loss)
575
+
576
+ return {
577
+ "loss": self.total_loss_tracker.result(),
578
+ "reconstruction_loss": self.reconstruction_loss_tracker.result(),
579
+ "kl_loss": self.kl_loss_tracker.result(),
580
+ "align_loss": self.align_loss_tracker.result(),
581
+ "cpc_loss": self.cpc_loss_tracker.result(),
582
+ }
583
+
584
+ def call(
585
+ self,
586
+ inputs,
587
+ training=None,
588
+ mask=None
589
+ # len_output,
590
+ # eval_cp_ix,
591
+ # start_ix=0,
592
+ # cp_ix=-1,
593
+ # model_mode="full",
594
+ # skip_frame=False,
595
+ # init_hidden=True,
596
+ ):
597
+ len_output = 20
598
+ eval_cp_ix = len_output - 1
599
+ start_ix = 0
600
+ cp_ix = -1
601
+ model_mode = "full"
602
+ skip_frame = False
603
+ init_hidden = True
604
+
605
+ batch_size, num_frames, h, w, channels = inputs.shape
606
+ dim_shape = (h, w, channels)
607
+
608
+ gen_seq = [inputs[:, 0, ...]]
609
+ x_in = inputs[:, 0, ...]
610
+
611
+ seq_len = inputs.shape[1]
612
+ cp_ix = seq_len - 1
613
+
614
+ x_cp, global_z = self.get_global_descriptor(
615
+ inputs, cp_ix=cp_ix
616
+ ) # here global_z is h_cp
617
+
618
+ skip_prob = self.skip_prob
619
+
620
+ prev_i = 0
621
+ max_skip_count = seq_len * skip_prob
622
+ skip_count = 0
623
+ probs = np.random.uniform(0, 1, len_output - 1)
624
+
625
+ for i in range(1, len_output):
626
+ if (
627
+ probs[i - 1] <= skip_prob
628
+ and i >= self.n_past
629
+ and skip_count < max_skip_count
630
+ and i != 1
631
+ and i != (len_output - 1)
632
+ and skip_frame
633
+ ):
634
+ skip_count += 1
635
+ gen_seq.append(tf.zeros_like(x_in))
636
+ continue
637
+
638
+ time_until_cp = tf.fill([100, 1], (eval_cp_ix - i + 1) / eval_cp_ix)
639
+
640
+ delta_time = tf.fill([100, 1], ((i - prev_i) / eval_cp_ix))
641
+
642
+ prev_i = i
643
+
644
+ h = self.encoder(x_in)
645
+
646
+ if self.last_frame_skip or i == 1 or i < self.n_past:
647
+ h, skip = h
648
+ else:
649
+ h, _ = h
650
+
651
+ h_cpaw = tf.stop_gradient(tf.concat([h, global_z, time_until_cp, delta_time], axis=-1))
652
+
653
+ if i < self.n_past:
654
+ h_target = self.encoder(inputs[:, i, ...])[0]
655
+ h_target_cpaw = tf.stop_gradient(tf.concat(
656
+ [h_target, global_z, time_until_cp, delta_time], axis=1
657
+ ))
658
+
659
+ zt, _, _ = self.posterior(h_target_cpaw)
660
+ zt_p, _, _ = self.prior(h_cpaw)
661
+
662
+ if model_mode == "posterior" or model_mode == "full":
663
+ self.frame_predictor(
664
+ tf.concat([h, zt, time_until_cp, delta_time], axis=-1)
665
+ )
666
+ elif model_mode == "prior":
667
+ self.frame_predictor(
668
+ tf.concat([h, zt_p, time_until_cp, delta_time], axis=-1)
669
+ )
670
+
671
+ x_in = inputs[:, i, ...]
672
+ gen_seq.append(x_in)
673
+ else:
674
+ if i < num_frames:
675
+ h_target = self.encoder(inputs[:, i, ...])[0]
676
+ h_target_cpaw = tf.stop_gradient(tf.concat(
677
+ [h_target, global_z, time_until_cp, delta_time], axis=-1
678
+ ))
679
+ else:
680
+ h_target_cpaw = h_cpaw
681
+
682
+ zt, _, _ = self.posterior(h_target_cpaw)
683
+ zt_p, _, _ = self.prior(h_cpaw)
684
+
685
+ if model_mode == "posterior":
686
+ h = self.frame_predictor(
687
+ tf.concat([h, zt, time_until_cp, delta_time], axis=-1)
688
+ )
689
+ elif model_mode == "prior" or model_mode == "full":
690
+ h = self.frame_predictor(
691
+ tf.concat([h, zt_p, time_until_cp, delta_time], axis=-1)
692
+ )
693
+
694
+ x_in = tf.stop_gradient(self.decoder([h, skip]))
695
+ gen_seq.append(x_in)
696
+
697
+ return tf.stack(gen_seq, axis=1)
698
+
699
+ def update_model(self, gradients, var_list):
700
+ self.frame_predictor_optimizer.apply_gradients(zip(gradients, var_list))
701
+ self.posterior_optimizer.apply_gradients(zip(gradients, var_list))
702
+ self.encoder_optimizer.apply_gradients(zip(gradients, var_list))
703
+ self.decoder_optimizer.apply_gradients(zip(gradients, var_list))
704
+ #self.prior_optimizer.apply_gradients(zip(gradients, var_list))
705
+
706
+ def update_prior(self, gradients, var_list):
707
+ self.prior_optimizer.apply_gradients(zip(gradients, var_list))
708
+
709
+ # def update_model_without_prior(self):
710
+ # self.frame_predictor_optimizer.step()
711
+ # self.posterior_optimizer.step()
712
+ # self.encoder_optimizer.step()
713
+ # self.decoder_optimizer.step()
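This experimental variant overrides compile() with one optimizer per sub-network, so wiring it up looks roughly like the sketch below; the Adam settings mirror the commented-out values above and are placeholders rather than a recommendation:

    import tensorflow as tf
    from ganime.model.p2p.p2p_test import P2P

    model = P2P(channels=1, g_dim=128, z_dim=10, rnn_size=256)

    def adam():
        # Placeholder hyper-parameters matching the commented-out optimizer settings above.
        return tf.keras.optimizers.Adam(
            learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
        )

    model.compile(
        frame_predictor_optimizer=adam(),
        prior_optimizer=adam(),
        posterior_optimizer=adam(),
        encoder_optimizer=adam(),
        decoder_optimizer=adam(),
    )
    # model.fit(dataset) would then exercise the custom train_step defined above.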
ganime/model/p2p/p2p_v2.py ADDED
@@ -0,0 +1,498 @@
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from tensorflow.keras import Model, Sequential
4
+ from tensorflow.keras.layers import (
5
+ LSTM,
6
+ Activation,
7
+ BatchNormalization,
8
+ Conv2D,
9
+ Conv2DTranspose,
10
+ Conv3D,
11
+ Conv3DTranspose,
12
+ Dense,
13
+ Flatten,
14
+ Input,
15
+ Layer,
16
+ LeakyReLU,
17
+ MaxPooling2D,
18
+ Reshape,
19
+ TimeDistributed,
20
+ UpSampling2D,
21
+ )
22
+ from tensorflow.keras.losses import Loss
23
+ from tensorflow.keras.losses import KLDivergence, MeanSquaredError
24
+ from tqdm.auto import tqdm
25
+
26
+
27
+ class KLCriterion(Loss):
28
+ def call(self, y_true, y_pred):
29
+ (mu1, logvar1), (mu2, logvar2) = y_true, y_pred
30
+
31
+ """KL( N(mu_1, sigma2_1) || N(mu_2, sigma2_2))"""
32
+ sigma1 = tf.exp(tf.math.multiply(logvar1, 0.5))
33
+ sigma2 = tf.exp(tf.math.multiply(logvar2, 0.5))
34
+
35
+ kld = (
36
+ tf.math.log(sigma2 / sigma1)
37
+ + (tf.exp(logvar1) + tf.square(mu1 - mu2)) / (2 * tf.exp(logvar2))
38
+ - 0.5
39
+ )
40
+ return kld
41
+
42
+
43
+ class Decoder(Model):
44
+ def __init__(self, dim, nc=1):
45
+ super().__init__()
46
+ self.dim = dim
47
+ self.upc1 = Sequential(
48
+ [
49
+ TimeDistributed(
50
+ Conv2DTranspose(512, kernel_size=4, strides=1, padding="valid")
51
+ ),
52
+ BatchNormalization(),
53
+ LeakyReLU(alpha=0.2),
54
+ ]
55
+ )
56
+ self.upc2 = Sequential(
57
+ [
58
+ TimeDistributed(
59
+ Conv2DTranspose(256, kernel_size=4, strides=2, padding="same")
60
+ ),
61
+ BatchNormalization(),
62
+ LeakyReLU(alpha=0.2),
63
+ ]
64
+ )
65
+ self.upc3 = Sequential(
66
+ [
67
+ TimeDistributed(
68
+ Conv2DTranspose(128, kernel_size=4, strides=2, padding="same")
69
+ ),
70
+ BatchNormalization(),
71
+ LeakyReLU(alpha=0.2),
72
+ ]
73
+ )
74
+ self.upc4 = Sequential(
75
+ [
76
+ TimeDistributed(
77
+ Conv2DTranspose(64, kernel_size=4, strides=2, padding="same")
78
+ ),
79
+ BatchNormalization(),
80
+ LeakyReLU(alpha=0.2),
81
+ ]
82
+ )
83
+ self.upc5 = Sequential(
84
+ [
85
+ TimeDistributed(
86
+ Conv2DTranspose(1, kernel_size=4, strides=2, padding="same")
87
+ ),
88
+ Activation("sigmoid"),
89
+ ]
90
+ )
91
+
92
+ def call(self, input):
93
+ vec, skip = input
94
+ d1 = self.upc1(tf.reshape(vec, (-1, 1, 1, 1, self.dim)))
95
+ d2 = self.upc2(tf.concat([d1, skip[3]], axis=-1))
96
+ d3 = self.upc3(tf.concat([d2, skip[2]], axis=-1))
97
+ d4 = self.upc4(tf.concat([d3, skip[1]], axis=-1))
98
+ output = self.upc5(tf.concat([d4, skip[0]], axis=-1))
99
+ return output
100
+
101
+
102
+ class Sampling(Layer):
103
+ """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
104
+
105
+ def call(self, inputs):
106
+ z_mean, z_log_var = inputs
107
+ batch = tf.shape(z_mean)[0]
108
+ dim = tf.shape(z_mean)[1]
109
+ epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
110
+ return z_mean + tf.exp(0.5 * z_log_var) * epsilon
111
+
112
+ def compute_output_shape(self, input_shape):
113
+ return input_shape[0]
114
+
115
+
116
+ class P2P(Model):
117
+ def __init__(
118
+ self,
119
+ channels: int = 1,
120
+ g_dim: int = 128,
121
+ z_dim: int = 10,
122
+ rnn_size: int = 256,
123
+ prior_rnn_layers: int = 1,
124
+ posterior_rnn_layers: int = 1,
125
+ predictor_rnn_layers: float = 1,
126
+ skip_prob: float = 0.1,
127
+ n_past: int = 1,
128
+ last_frame_skip: bool = False,
129
+ beta: float = 0.0001,
130
+ weight_align: float = 0.1,
131
+ weight_cpc: float = 100,
132
+ ):
133
+ super().__init__()
134
+ # Models parameters
135
+ self.channels = channels
136
+ self.g_dim = g_dim
137
+ self.z_dim = z_dim
138
+ self.rnn_size = rnn_size
139
+ self.prior_rnn_layers = prior_rnn_layers
140
+ self.posterior_rnn_layers = posterior_rnn_layers
141
+ self.predictor_rnn_layers = predictor_rnn_layers
142
+
143
+ # Training parameters
144
+ self.skip_prob = skip_prob
145
+ self.n_past = n_past
146
+ self.last_frame_skip = last_frame_skip
147
+ self.beta = beta
148
+ self.weight_align = weight_align
149
+ self.weight_cpc = weight_cpc
150
+
151
+ self.frame_predictor = self.build_lstm()
152
+ self.prior = self.build_gaussian_lstm()
153
+ self.posterior = self.build_gaussian_lstm()
154
+ self.encoder = self.build_encoder()
155
+ self.decoder = self.build_decoder()
156
+
157
+ self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss")
158
+ self.reconstruction_loss_tracker = tf.keras.metrics.Mean(
159
+ name="reconstruction_loss"
160
+ )
161
+ self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")
162
+ self.align_loss_tracker = tf.keras.metrics.Mean(name="align_loss")
163
+ self.cpc_loss_tracker = tf.keras.metrics.Mean(name="align_loss")
164
+
165
+ self.kl_loss = KLCriterion(
166
+ reduction=tf.keras.losses.Reduction.NONE
167
+ ) # KLDivergence(reduction=tf.keras.losses.Reduction.NONE)
168
+ self.mse = MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
169
+ self.align_loss = MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
170
+
171
+ # self.optimizer = tf.keras.optimizers.Adam(
172
+ # learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
173
+ # )
174
+ # self.prior_optimizer = tf.keras.optimizers.Adam(
175
+ # learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
176
+ # )
177
+
178
+ # region Model building
179
+ def build_lstm(self):
180
+ input = Input(shape=(None, self.g_dim + self.z_dim))
181
+ embed = TimeDistributed(Dense(self.rnn_size))(input)
182
+ lstm = LSTM(self.rnn_size)(embed)
183
+ output = Dense(self.g_dim)(lstm)
184
+ output = (tf.expand_dims(output, axis=1),)
185
+
186
+ return Model(inputs=input, outputs=output, name="frame_predictor")
187
+
188
+ def build_gaussian_lstm(self):
189
+
190
+ input = Input(shape=(None, self.g_dim))
191
+ embed = TimeDistributed(Dense(self.rnn_size))(input)
192
+ lstm = LSTM(self.rnn_size)(embed)
193
+ mu = Dense(self.z_dim)(lstm)
194
+ logvar = Dense(self.z_dim)(lstm)
195
+ z = Sampling()([mu, logvar])
196
+
197
+ return Model(inputs=input, outputs=[mu, logvar, z])
198
+
199
+ def build_encoder(self):
200
+
201
+ input = Input(shape=(1, 64, 64, 1))
202
+
203
+ h = TimeDistributed(Conv2D(64, kernel_size=4, strides=2, padding="same"))(input)
204
+ h = BatchNormalization()(h)
205
+ h1 = LeakyReLU(alpha=0.2)(h)
206
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
207
+
208
+ h = TimeDistributed(Conv2D(128, kernel_size=4, strides=2, padding="same"))(h1)
209
+ h = BatchNormalization()(h)
210
+ h2 = LeakyReLU(alpha=0.2)(h)
211
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
212
+
213
+ h = TimeDistributed(Conv2D(256, kernel_size=4, strides=2, padding="same"))(h2)
214
+ h = BatchNormalization()(h)
215
+ h3 = LeakyReLU(alpha=0.2)(h)
216
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
217
+
218
+ h = TimeDistributed(Conv2D(512, kernel_size=4, strides=2, padding="same"))(h3)
219
+ h = BatchNormalization()(h)
220
+ h4 = LeakyReLU(alpha=0.2)(h)
221
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
222
+
223
+ h = TimeDistributed(
224
+ Conv2D(self.g_dim, kernel_size=4, strides=1, padding="valid")
225
+ )(h4)
226
+ h = BatchNormalization()(h)
227
+ h5 = Activation("tanh")(h)
228
+
229
+ output = tf.reshape(h5, (-1, 1, self.g_dim))
230
+ # h = Flatten()(h)
231
+ # output = Dense(self.g_dim)(h)
232
+ # output = tf.expand_dims(output, axis=1)
233
+ return Model(inputs=input, outputs=[output, [h1, h2, h3, h4]], name="encoder")
234
+
235
+ def build_decoder(self):
236
+ return Decoder(self.g_dim)
237
+
238
+ # def build_decoder(self):
239
+ # latent_inputs = Input(
240
+ # shape=(
241
+ # 1,
242
+ # self.g_dim,
243
+ # )
244
+ # )
245
+ # x = Dense(1 * 1 * 1 * 128, activation="relu")(latent_inputs)
246
+ # x = Reshape((1, 1, 1, 128))(x)
247
+ # x = TimeDistributed(
248
+ # Conv2DTranspose(512, kernel_size=4, strides=1, padding="valid")
249
+ # )(x)
250
+ # x = BatchNormalization()(x)
251
+ # x1 = LeakyReLU(alpha=0.2)(x)
252
+
253
+ # x = TimeDistributed(
254
+ # Conv2DTranspose(256, kernel_size=4, strides=2, padding="same")
255
+ # )(x1)
256
+ # x = BatchNormalization()(x)
257
+ # x2 = LeakyReLU(alpha=0.2)(x)
258
+
259
+ # x = TimeDistributed(
260
+ # Conv2DTranspose(128, kernel_size=4, strides=2, padding="same")
261
+ # )(x2)
262
+ # x = BatchNormalization()(x)
263
+ # x3 = LeakyReLU(alpha=0.2)(x)
264
+
265
+ # x = TimeDistributed(
266
+ # Conv2DTranspose(64, kernel_size=4, strides=2, padding="same")
267
+ # )(x3)
268
+ # x = BatchNormalization()(x)
269
+ # x4 = LeakyReLU(alpha=0.2)(x)
270
+
271
+ # x = TimeDistributed(
272
+ # Conv2DTranspose(1, kernel_size=4, strides=2, padding="same")
273
+ # )(x4)
274
+ # x5 = Activation("sigmoid")(x)
275
+
276
+ # return Model(inputs=latent_inputs, outputs=x5, name="decoder")
277
+
278
+ # endregion
279
+
280
+ @property
281
+ def metrics(self):
282
+ return [
283
+ self.total_loss_tracker,
284
+ self.reconstruction_loss_tracker,
285
+ self.kl_loss_tracker,
286
+ self.align_loss_tracker,
287
+ self.cpc_loss_tracker,
288
+ ]
289
+
290
+ def call(self, inputs, training=None, mask=None):
291
+ first_frame = inputs[:, 0:1, ...]
292
+ last_frame = inputs[:, -1:, ...]
293
+
294
+ desired_length = 20
295
+ previous_frame = first_frame
296
+ generated = [first_frame]
297
+
298
+ z_last, _ = self.encoder(last_frame)
299
+ for i in range(1, desired_length):
300
+
301
+ z_prev = self.encoder(previous_frame)
302
+
303
+ if self.last_frame_skip or i == 1 or i < self.n_past:
304
+ z_prev, skip = z_prev
305
+ else:
306
+ z_prev = z_prev[0]
307
+
308
+ prior_input = tf.concat([z_prev, z_last], axis=1)
309
+
310
+ z_mean_prior, z_log_var_prior, z_prior = self.prior(prior_input)
311
+
312
+ predictor_input = tf.concat(
313
+ (z_prev, tf.expand_dims(z_prior, axis=1)), axis=-1
314
+ )
315
+ z_pred = self.frame_predictor(predictor_input)
316
+
317
+ current_frame = self.decoder([z_pred, skip])
318
+ generated.append(current_frame)
319
+ previous_frame = current_frame
320
+ return tf.concat(generated, axis=1)
321
+
322
+ def train_step(self, data):
323
+ global_batch_size = 100 # * 8
324
+ x, y = data
325
+
326
+ first_frame = x[:, 0:1, ...]
327
+ last_frame = x[:, -1:, ...]
328
+ desired_length = y.shape[1]
329
+ previous_frame = first_frame
330
+
331
+ reconstruction_loss = 0
332
+ kl_loss = 0
333
+ align_loss = 0
334
+ cpc_loss = 0
335
+
336
+ with tf.GradientTape(persistent=True) as tape:
337
+ z_last, _ = self.encoder(last_frame)
338
+ for i in tqdm(range(1, desired_length)):
339
+ current_frame = y[:, i : i + 1, ...]
340
+
341
+ z_prev = self.encoder(previous_frame)
342
+
343
+ if self.last_frame_skip or i <= self.n_past:
344
+ z_prev, skip = z_prev
345
+ else:
346
+ z_prev = z_prev[0]
347
+
348
+ z_curr, _ = self.encoder(current_frame)
349
+
350
+ prior_input = tf.concat([z_prev, z_last], axis=1)
351
+ posterior_input = tf.concat([z_curr, z_last], axis=1)
352
+
353
+ z_mean_prior, z_log_var_prior, z_prior = self.prior(prior_input)
354
+ z_mean_posterior, z_log_var_posterior, z_posterior = self.posterior(
355
+ posterior_input
356
+ )
357
+
358
+ # predictor_input = z_prev
359
+ predictor_input = tf.concat(
360
+ (z_prev, tf.expand_dims(z_posterior, axis=1)), axis=-1
361
+ )
362
+
363
+ z_pred = self.frame_predictor(predictor_input)
364
+
365
+ kl_loss += tf.reduce_sum(
366
+ self.kl_loss(
367
+ (z_mean_prior, z_log_var_prior),
368
+ (z_mean_posterior, z_log_var_posterior),
369
+ )
370
+ ) * (1.0 / global_batch_size)
371
+
372
+ if i > 1:
373
+ align_loss += tf.reduce_sum(self.align_loss(z_pred, z_curr)) * (
374
+ 1.0 / global_batch_size
375
+ )
376
+
377
+ if i == desired_length - 1:
378
+ h_pred_p = self.frame_predictor(
379
+ tf.concat([z_prev, tf.expand_dims(z_prior, axis=1)], axis=-1)
380
+ )
381
+ x_pred_p = self.decoder([h_pred_p, skip])
382
+ cpc_loss = tf.reduce_sum(self.mse(x_pred_p, current_frame)) * (
383
+ 1.0 / global_batch_size
384
+ )
385
+
386
+ prediction = self.decoder([z_pred, skip])
387
+ reconstruction_loss += tf.reduce_sum(
388
+ self.mse(prediction, current_frame)
389
+ ) * (1.0 / global_batch_size)
390
+
391
+ previous_frame = current_frame
392
+
393
+ loss = (
394
+ reconstruction_loss
395
+ + kl_loss * self.beta
396
+ + align_loss * self.weight_align
397
+ + cpc_loss * self.weight_cpc
398
+ )
399
+
400
+ prior_loss = kl_loss + cpc_loss * self.weight_cpc
401
+
402
+ grads_without_prior = tape.gradient(
403
+ loss,
404
+ (
405
+ self.encoder.trainable_weights
406
+ + self.decoder.trainable_weights
407
+ + self.posterior.trainable_weights
408
+ + self.frame_predictor.trainable_weights
409
+ ),
410
+ )
411
+ self.optimizer.apply_gradients(
412
+ zip(
413
+ grads_without_prior,
414
+ (
415
+ self.encoder.trainable_weights
416
+ + self.decoder.trainable_weights
417
+ + self.posterior.trainable_weights
418
+ + self.frame_predictor.trainable_weights
419
+ ),
420
+ )
421
+ )
422
+
423
+ grads_prior = tape.gradient(
424
+ prior_loss,
425
+ self.prior.trainable_weights,
426
+ )
427
+
428
+ self.optimizer.apply_gradients(
429
+ zip(
430
+ grads_prior,
431
+ self.prior.trainable_weights,
432
+ )
433
+ )
434
+ del tape
435
+
436
+ self.total_loss_tracker.update_state(loss)
437
+ self.kl_loss_tracker.update_state(kl_loss)
438
+ self.align_loss_tracker.update_state(align_loss)
439
+ self.reconstruction_loss_tracker.update_state(reconstruction_loss)
440
+ self.cpc_loss_tracker.update_state(cpc_loss)
441
+
442
+ return {
443
+ "loss": self.total_loss_tracker.result(),
444
+ "reconstruction_loss": self.reconstruction_loss_tracker.result(),
445
+ "kl_loss": self.kl_loss_tracker.result(),
446
+ "align_loss": self.align_loss_tracker.result(),
447
+ "cpc_loss": self.cpc_loss_tracker.result(),
448
+ }
449
+
450
+ # print("KL_LOSS")
451
+ # print(kl_loss)
452
+ # print("ALIGN_LOSS")
453
+ # print(align_loss)
454
+ # print("RECONSTRUCTION_LOSS")
455
+ # print(reconstruction_loss)
456
+
457
+ # with tf.GradientTape() as tape:
458
+ # z_mean, z_log_var, z = self.encoder(x)
459
+ # reconstruction = self.decoder(z)
460
+ # reconstruction_loss = tf.reduce_mean(
461
+ # tf.reduce_sum(
462
+ # tf.keras.losses.binary_crossentropy(y, reconstruction),
463
+ # axis=(1, 2),
464
+ # )
465
+ # )
466
+ # kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
467
+ # kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
468
+ # total_loss = reconstruction_loss + self.kl_beta * kl_loss
469
+ # grads = tape.gradient(total_loss, self.trainable_weights)
470
+ # self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
471
+ # self.total_loss_tracker.update_state(total_loss)
472
+ # self.reconstruction_loss_tracker.update_state(reconstruction_loss)
473
+ # self.kl_loss_tracker.update_state(kl_loss)
474
+ # return {
475
+ # "loss": self.total_loss_tracker.result(),
476
+ # "reconstruction_loss": self.reconstruction_loss_tracker.result(),
477
+ # "kl_loss": self.kl_loss_tracker.result(),
478
+ # }
479
+
480
+ # def test_step(self, data):
481
+ # if isinstance(data, tuple):
482
+ # data = data[0]
483
+
484
+ # z_mean, z_log_var, z = self.encoder(data)
485
+ # reconstruction = self.decoder(z)
486
+ # reconstruction_loss = tf.reduce_mean(
487
+ # tf.keras.losses.binary_crossentropy(data, reconstruction)
488
+ # )
489
+ # reconstruction_loss *= 28 * 28
490
+ # kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
491
+ # kl_loss = tf.reduce_mean(kl_loss)
492
+ # kl_loss *= -0.5
493
+ # total_loss = reconstruction_loss + kl_loss
494
+ # return {
495
+ # "loss": total_loss,
496
+ # "reconstruction_loss": reconstruction_loss,
497
+ # "kl_loss": kl_loss,
498
+ # }
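
Note on the training loop above: the prior network is optimised separately from the encoder/decoder/posterior/frame-predictor group, which is why a persistent `tf.GradientTape` is opened and `apply_gradients` is called twice. The following standalone sketch (toy layers and losses, not code from this repository) shows the same two-group update pattern in isolation:

import tensorflow as tf

# Two toy sub-networks standing in for the encoder/decoder/posterior/predictor group
# and the prior network.
main_net = tf.keras.Sequential([tf.keras.layers.Dense(8, activation="relu"), tf.keras.layers.Dense(1)])
prior_net = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(1e-3)

x = tf.random.normal((4, 3))
y = tf.random.normal((4, 1))

with tf.GradientTape(persistent=True) as tape:
    pred = main_net(x) + prior_net(x)              # both groups contribute to the output
    full_loss = tf.reduce_mean((pred - y) ** 2)    # analogue of reconstruction + kl + align + cpc
    prior_loss = 0.1 * full_loss                   # analogue of kl_loss + cpc_loss * weight_cpc

main_vars = main_net.trainable_weights
prior_vars = prior_net.trainable_weights
optimizer.apply_gradients(zip(tape.gradient(full_loss, main_vars), main_vars))
optimizer.apply_gradients(zip(tape.gradient(prior_loss, prior_vars), prior_vars))
del tape  # a persistent tape must be released explicitly, as in train_step above
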
ganime/model/p2p/p2p_v3.py ADDED
@@ -0,0 +1,237 @@
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from tensorflow.keras import Model, Sequential
4
+ from tensorflow.keras.layers import (
5
+ LSTM,
6
+ Activation,
7
+ BatchNormalization,
8
+ Conv2D,
9
+ Conv2DTranspose,
10
+ Conv3D,
11
+ Conv3DTranspose,
12
+ Dense,
13
+ Flatten,
14
+ Input,
15
+ Layer,
16
+ LeakyReLU,
17
+ MaxPooling2D,
18
+ Reshape,
19
+ TimeDistributed,
20
+ UpSampling2D,
21
+ )
22
+
23
+
24
+ SEQ_LEN = 20
25
+
26
+
27
+ class Sampling(Layer):
28
+ """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
29
+
30
+ def call(self, inputs):
31
+ z_mean, z_log_var = inputs
32
+ # sample with the full shape of z_mean so this also works on (batch, time, z_dim) tensors
+ epsilon = tf.keras.backend.random_normal(shape=tf.shape(z_mean))
35
+ return z_mean + tf.exp(0.5 * z_log_var) * epsilon
36
+
37
+ def compute_output_shape(self, input_shape):
38
+ return input_shape[0]
39
+
40
+
41
+ class P2P(Model):
42
+ def __init__(
43
+ self,
44
+ channels: int = 1,
45
+ g_dim: int = 128,
46
+ z_dim: int = 10,
47
+ rnn_size: int = 256,
48
+ prior_rnn_layers: int = 1,
49
+ posterior_rnn_layers: int = 1,
50
+ predictor_rnn_layers: int = 1,
51
+ skip_prob: float = 0.1,
52
+ n_past: int = 1,
53
+ last_frame_skip: bool = False,
54
+ beta: float = 0.0001,
55
+ weight_align: float = 0.1,
56
+ weight_cpc: float = 100,
57
+ ):
58
+ super().__init__()
59
+ # Models parameters
60
+ self.channels = channels
61
+ self.g_dim = g_dim
62
+ self.z_dim = z_dim
63
+ self.rnn_size = rnn_size
64
+ self.prior_rnn_layers = prior_rnn_layers
65
+ self.posterior_rnn_layers = posterior_rnn_layers
66
+ self.predictor_rnn_layers = predictor_rnn_layers
67
+
68
+ # Training parameters
69
+ self.skip_prob = skip_prob
70
+ self.n_past = n_past
71
+ self.last_frame_skip = last_frame_skip
72
+ self.beta = beta
73
+ self.weight_align = weight_align
74
+ self.weight_cpc = weight_cpc
75
+
76
+ self.frame_predictor = self.build_lstm()
77
+ self.prior = self.build_gaussian_lstm()
78
+ self.posterior = self.build_gaussian_lstm()
79
+ self.encoder = self.build_encoder()
80
+ self.decoder = self.build_decoder()
81
+
82
+ self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss")
83
+ self.reconstruction_loss_tracker = tf.keras.metrics.Mean(
84
+ name="reconstruction_loss"
85
+ )
86
+ self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")
87
+
88
+ # region Model building
89
+ def build_lstm(self):
90
+ input = Input(shape=(20, self.g_dim + self.z_dim + 1))
91
+ embed = TimeDistributed(Dense(self.rnn_size))(input)
92
+ lstm = LSTM(self.rnn_size, return_sequences=True)(embed)
93
+ output = TimeDistributed(Dense(self.g_dim))(lstm)
94
+
95
+ return Model(inputs=input, outputs=output, name="frame_predictor")
96
+
97
+ def build_gaussian_lstm(self):
98
+
99
+ input = Input(shape=(20, self.g_dim))
100
+ embed = TimeDistributed(Dense(self.rnn_size))(input)
101
+ lstm = LSTM(self.rnn_size, return_sequences=True)(embed)
102
+ mu = TimeDistributed(Dense(self.z_dim))(lstm)
103
+ logvar = TimeDistributed(Dense(self.z_dim))(lstm)
104
+ # Sampling broadcasts over the time axis itself; TimeDistributed cannot wrap a multi-input layer
+ z = Sampling()([mu, logvar])
105
+
106
+ return Model(inputs=input, outputs=[mu, logvar, z])
107
+
108
+ def build_encoder(self):
109
+
110
+ input = Input(shape=(2, 64, 64, 1))
111
+
112
+ h = TimeDistributed(Conv2D(64, kernel_size=4, strides=2, padding="same"))(input)
113
+ h = BatchNormalization()(h)
114
+ h = LeakyReLU(alpha=0.2)(h)
115
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
116
+
117
+ h = TimeDistributed(Conv2D(128, kernel_size=4, strides=2, padding="same"))(h)
118
+ h = BatchNormalization()(h)
119
+ h = LeakyReLU(alpha=0.2)(h)
120
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
121
+
122
+ h = TimeDistributed(Conv2D(256, kernel_size=4, strides=2, padding="same"))(h)
123
+ h = BatchNormalization()(h)
124
+ h = LeakyReLU(alpha=0.2)(h)
125
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
126
+
127
+ h = TimeDistributed(Conv2D(512, kernel_size=4, strides=2, padding="same"))(h)
128
+ h = BatchNormalization()(h)
129
+ h = LeakyReLU(alpha=0.2)(h)
130
+ # h = TimeDistributed(MaxPooling2D(pool_size=2, strides=2, padding="same"))(h)
131
+
132
+ h = Flatten()(h)
133
+ # mu = Dense(self.g_dim)(h)
134
+ # logvar = Dense(self.g_dim)(h)
135
+
136
+ # z = Sampling()([mu, logvar])
137
+ lstm_input = Dense(self.g_dim * SEQ_LEN)(h)
138
+ lstm_input = Reshape((SEQ_LEN, self.g_dim))(lstm_input)
139
+ mu, logvar, z = self.posterior(lstm_input)
140
+
141
+ return Model(inputs=input, outputs=[mu, logvar, z], name="encoder")
142
+
143
+ def build_decoder(self):
144
+ latent_inputs = Input(shape=(SEQ_LEN, self.z_dim))
145
+ x = Dense(1 * 1 * 1 * 512, activation="relu")(latent_inputs)
146
+ x = Reshape((SEQ_LEN, 1, 1, 512))(x)
147
+ x = TimeDistributed(
148
+ Conv2DTranspose(512, kernel_size=4, strides=1, padding="valid")
149
+ )(x)
150
+ x = BatchNormalization()(x)
151
+ x = LeakyReLU(alpha=0.2)(x)
152
+
153
+ x = TimeDistributed(
154
+ Conv2DTranspose(256, kernel_size=4, strides=2, padding="same")
155
+ )(x)
156
+ x = BatchNormalization()(x)
157
+ x = LeakyReLU(alpha=0.2)(x)
158
+
159
+ x = TimeDistributed(
160
+ Conv2DTranspose(128, kernel_size=4, strides=2, padding="same")
161
+ )(x)
162
+ x = BatchNormalization()(x)
163
+ x = LeakyReLU(alpha=0.2)(x)
164
+
165
+ x = TimeDistributed(
166
+ Conv2DTranspose(64, kernel_size=4, strides=2, padding="same")
167
+ )(x)
168
+ x = BatchNormalization()(x)
169
+ x = LeakyReLU(alpha=0.2)(x)
170
+
171
+ x = TimeDistributed(
172
+ Conv2DTranspose(1, kernel_size=4, strides=2, padding="same")
173
+ )(x)
174
+ x = Activation("sigmoid")(x)
175
+
176
+ return Model(inputs=latent_inputs, outputs=x, name="decoder")
177
+
178
+ # endregion
179
+
180
+ @property
181
+ def metrics(self):
182
+ return [
183
+ self.total_loss_tracker,
184
+ self.reconstruction_loss_tracker,
185
+ self.kl_loss_tracker,
186
+ ]
187
+
188
+ def call(self, inputs, training=None, mask=None):
189
+ z_mean, z_log_var, z = self.encoder(inputs)
190
+ pred = self.decoder(z)
191
+ return pred
192
+
193
+ def train_step(self, data):
194
+ x, y = data
195
+
196
+ with tf.GradientTape() as tape:
197
+ z_mean, z_log_var, z = self.encoder(x)
198
+ reconstruction = self.decoder(z)
199
+ reconstruction_loss = tf.reduce_mean(
200
+ tf.reduce_sum(
201
+ tf.keras.losses.binary_crossentropy(y, reconstruction),
202
+ axis=(1, 2),
203
+ )
204
+ )
205
+ kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
206
+ kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
207
+ total_loss = reconstruction_loss + self.beta * kl_loss
208
+ grads = tape.gradient(total_loss, self.trainable_weights)
209
+ self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
210
+ self.total_loss_tracker.update_state(total_loss)
211
+ self.reconstruction_loss_tracker.update_state(reconstruction_loss)
212
+ self.kl_loss_tracker.update_state(kl_loss)
213
+ return {
214
+ "loss": self.total_loss_tracker.result(),
215
+ "reconstruction_loss": self.reconstruction_loss_tracker.result(),
216
+ "kl_loss": self.kl_loss_tracker.result(),
217
+ }
218
+
219
+ def test_step(self, data):
220
+ if isinstance(data, tuple):
221
+ data = data[0]
222
+
223
+ z_mean, z_log_var, z = self.encoder(data)
224
+ reconstruction = self.decoder(z)
225
+ reconstruction_loss = tf.reduce_mean(
226
+ tf.keras.losses.binary_crossentropy(data, reconstruction)
227
+ )
228
+ reconstruction_loss *= 28 * 28
229
+ kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
230
+ kl_loss = tf.reduce_mean(kl_loss)
231
+ kl_loss *= -0.5
232
+ total_loss = reconstruction_loss + kl_loss
233
+ return {
234
+ "loss": total_loss,
235
+ "reconstruction_loss": reconstruction_loss,
236
+ "kl_loss": kl_loss,
237
+ }
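
Unlike the frame-by-frame variant above, this version behaves as a plain sequence beta-VAE: the encoder maps a (first frame, last frame) pair to a 20-step latent sequence through the posterior LSTM, and the decoder reconstructs all 20 frames at once. A minimal smoke test might look like the sketch below (shapes follow the hard-coded `Input` layers, and it assumes the `Sampling` layer samples with the full tensor shape as noted above):

import numpy as np
import tensorflow as tf

model = P2P(g_dim=128, z_dim=10, rnn_size=256, beta=1e-4)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3))

x_dummy = np.random.rand(8, 2, 64, 64, 1).astype("float32")   # first and last frame of each clip
y_dummy = np.random.rand(8, 20, 64, 64, 1).astype("float32")  # full 20-frame target clip
model.fit(x_dummy, y_dummy, batch_size=4, epochs=1)            # exercises train_step above
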
ganime/model/vae/vae.py ADDED
@@ -0,0 +1,98 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+ from tensorflow import keras
5
+ from tensorflow.keras import layers
6
+ import tensorflow as tf
7
+
8
+ input_shape = (20, 64, 64, 1)
9
+
10
+ class Sampling(keras.layers.Layer):
11
+ """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
12
+
13
+ def call(self, inputs):
14
+ z_mean, z_log_var = inputs
15
+ batch = tf.shape(z_mean)[0]
16
+ dim = z_mean.shape[1:]
17
+ epsilon = tf.keras.backend.random_normal(shape=(batch, *dim))
18
+ return z_mean + tf.exp(0.5 * z_log_var) * epsilon
19
+
20
+ def compute_output_shape(self, input_shape):
21
+ return input_shape[0]
22
+
23
+
24
+ class VAE(keras.Model):
25
+ def __init__(self, latent_dim:int=32, num_embeddings:int=128, beta:float = 0.5, **kwargs):
26
+ super().__init__(**kwargs)
27
+ self.latent_dim = latent_dim
28
+ self.num_embeddings = num_embeddings
29
+ self.beta = beta
30
+
31
+ self.encoder = self.get_encoder()
32
+ self.decoder = self.get_decoder()
33
+
34
+ self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss")
35
+ self.reconstruction_loss_tracker = tf.keras.metrics.Mean(
36
+ name="reconstruction_loss"
37
+ )
38
+ self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")
39
+
40
+
41
+ def get_encoder(self):
42
+ encoder_inputs = keras.Input(shape=input_shape)
43
+ x = layers.TimeDistributed(layers.Conv2D(32, 3, activation="relu", strides=2, padding="same"))(
44
+ encoder_inputs
45
+ )
46
+ x = layers.TimeDistributed(layers.Conv2D(64, 3, activation="relu", strides=2, padding="same"))(x)
47
+ x = layers.TimeDistributed(layers.Conv2D(self.latent_dim, 1, padding="same"))(x)
48
+
49
+ x = layers.TimeDistributed(layers.Flatten())(x)
50
+ mu = layers.TimeDistributed(layers.Dense(self.num_embeddings))(x)
51
+ logvar = layers.TimeDistributed(layers.Dense(self.num_embeddings))(x)
52
+ z = Sampling()([mu, logvar])
53
+
54
+ return keras.Model(encoder_inputs, [mu, logvar, z], name="encoder")
55
+
56
+
57
+ def get_decoder(self):
58
+ latent_inputs = keras.Input(shape=self.encoder.output[2].shape[1:])
59
+
60
+ x = layers.TimeDistributed(layers.Dense(16 * 16 * 32, activation="relu"))(latent_inputs)
61
+ x = layers.TimeDistributed(layers.Reshape((16, 16, 32)))(x)
62
+ x = layers.TimeDistributed(layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same"))(
63
+ x
64
+ )
65
+ x = layers.TimeDistributed(layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same"))(x)
66
+ decoder_outputs = layers.TimeDistributed(layers.Conv2DTranspose(1, 3, padding="same"))(x)
67
+ return keras.Model(latent_inputs, decoder_outputs, name="decoder")
68
+
69
+ def train_step(self, data):
70
+ x, y = data
71
+
72
+ with tf.GradientTape() as tape:
73
+ mu, logvar, z = self.encoder(x)
74
+ reconstruction = self.decoder(z)
75
+ reconstruction_loss = tf.reduce_mean(
76
+ tf.reduce_sum(
77
+ tf.keras.losses.binary_crossentropy(y, reconstruction),
78
+ axis=(1, 2),
79
+ )
80
+ )
81
+ kl_loss = -0.5 * (1 + logvar - tf.square(mu) - tf.exp(logvar))
82
+ kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
83
+ total_loss = reconstruction_loss + self.beta * kl_loss
84
+ grads = tape.gradient(total_loss, self.trainable_weights)
85
+ self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
86
+ self.total_loss_tracker.update_state(total_loss)
87
+ self.reconstruction_loss_tracker.update_state(reconstruction_loss)
88
+ self.kl_loss_tracker.update_state(kl_loss)
89
+ return {
90
+ "loss": self.total_loss_tracker.result(),
91
+ "reconstruction_loss": self.reconstruction_loss_tracker.result(),
92
+ "kl_loss": self.kl_loss_tracker.result(),
93
+ }
94
+
95
+ def call(self, inputs, training=False, mask=None):
96
+ z_mean, z_log_var, z = self.encoder(inputs)
97
+ pred = self.decoder(z)
98
+ return pred
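
Since the decoder ends in a plain `Conv2DTranspose` with no output activation, sampling new clips after training requires squashing the outputs explicitly. A hedged generation sketch (class and shapes as defined above, training data omitted):

import tensorflow as tf

vae = VAE(latent_dim=32, num_embeddings=128, beta=0.5)
vae.compile(optimizer=tf.keras.optimizers.Adam(1e-3))
# vae.fit(train_x, train_y, ...)   # clips of shape (20, 64, 64, 1)

z = tf.random.normal((1, 20, 128))   # one latent vector per time step, dim = num_embeddings
frames = vae.decoder(z)              # (1, 20, 64, 64, 1), unbounded values
frames = tf.sigmoid(frames)          # map to [0, 1] for visualisation
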
ganime/model/vq_vae/vq_vae.py ADDED
@@ -0,0 +1,143 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+ from tensorflow import keras
5
+ from tensorflow.keras import layers
6
+ import tensorflow as tf
7
+
8
+ input_shape = (20, 64, 64, 1)
9
+
10
+ class VectorQuantizer(layers.Layer):
11
+ def __init__(self, num_embeddings, embedding_dim, beta=0.25, **kwargs):
12
+ super().__init__(**kwargs)
13
+ self.embedding_dim = embedding_dim
14
+ self.num_embeddings = num_embeddings
15
+ self.beta = (
16
+ beta # This parameter is best kept between [0.25, 2] as per the paper.
17
+ )
18
+
19
+ # Initialize the embeddings which we will quantize.
20
+ w_init = tf.random_uniform_initializer()
21
+ self.embeddings = tf.Variable(
22
+ initial_value=w_init(
23
+ shape=(self.embedding_dim, self.num_embeddings), dtype="float32"
24
+ ),
25
+ trainable=True,
26
+ name="embeddings_vqvae",
27
+ )
28
+
29
+ def call(self, x):
30
+ # Calculate the input shape of the inputs and
31
+ # then flatten the inputs keeping `embedding_dim` intact.
32
+ input_shape = tf.shape(x)
33
+ flattened = tf.reshape(x, [-1, self.embedding_dim])
34
+
35
+ # Quantization.
36
+ encoding_indices = self.get_code_indices(flattened)
37
+ encodings = tf.one_hot(encoding_indices, self.num_embeddings)
38
+ quantized = tf.matmul(encodings, self.embeddings, transpose_b=True)
39
+ quantized = tf.reshape(quantized, input_shape)
40
+
41
+ # Calculate vector quantization loss and add that to the layer. You can learn more
42
+ # about adding losses to different layers here:
43
+ # https://keras.io/guides/making_new_layers_and_models_via_subclassing/. Check
44
+ # the original paper to get a handle on the formulation of the loss function.
45
+ commitment_loss = self.beta * tf.reduce_mean(
46
+ (tf.stop_gradient(quantized) - x) ** 2
47
+ )
48
+ codebook_loss = tf.reduce_mean((quantized - tf.stop_gradient(x)) ** 2)
49
+ self.add_loss(commitment_loss + codebook_loss)
50
+
51
+ # Straight-through estimator.
52
+ quantized = x + tf.stop_gradient(quantized - x)
53
+ return quantized
54
+
55
+ def get_code_indices(self, flattened_inputs):
56
+ # Calculate L2-normalized distance between the inputs and the codes.
57
+ similarity = tf.matmul(flattened_inputs, self.embeddings)
58
+ distances = (
59
+ tf.reduce_sum(flattened_inputs ** 2, axis=1, keepdims=True)
60
+ + tf.reduce_sum(self.embeddings ** 2, axis=0)
61
+ - 2 * similarity
62
+ )
63
+
64
+ # Derive the indices for minimum distances.
65
+ encoding_indices = tf.argmin(distances, axis=1)
66
+ return encoding_indices
67
+
68
+
69
+ class VQVAE(keras.Model):
70
+ def __init__(self, train_variance:float, latent_dim:int=32, num_embeddings:int=128, **kwargs):
71
+ super().__init__(**kwargs)
72
+ self.train_variance = train_variance
73
+ self.latent_dim = latent_dim
74
+ self.num_embeddings = num_embeddings
75
+
76
+ self.vqvae = self.get_vqvae()
77
+
78
+ self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
79
+ self.reconstruction_loss_tracker = keras.metrics.Mean(
80
+ name="reconstruction_loss"
81
+ )
82
+ self.vq_loss_tracker = keras.metrics.Mean(name="vq_loss")
83
+
84
+
85
+ def get_encoder(self):
86
+ encoder_inputs = keras.Input(shape=input_shape)
87
+ x = layers.TimeDistributed(layers.Conv2D(32, 3, activation="relu", strides=2, padding="same"))(
88
+ encoder_inputs
89
+ )
90
+ x = layers.TimeDistributed(layers.Conv2D(64, 3, activation="relu", strides=2, padding="same"))(x)
91
+ encoder_outputs = layers.TimeDistributed(layers.Conv2D(self.latent_dim, 1, padding="same"))(x)
92
+ return keras.Model(encoder_inputs, encoder_outputs, name="encoder")
93
+
94
+
95
+ def get_decoder(self):
96
+ latent_inputs = keras.Input(shape=self.get_encoder().output.shape[1:])
97
+ x = layers.TimeDistributed(layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same"))(
98
+ latent_inputs
99
+ )
100
+ x = layers.TimeDistributed(layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same"))(x)
101
+ decoder_outputs = layers.TimeDistributed(layers.Conv2DTranspose(1, 3, padding="same"))(x)
102
+ return keras.Model(latent_inputs, decoder_outputs, name="decoder")
103
+
104
+ def get_vqvae(self):
105
+ self.vq_layer = VectorQuantizer(self.num_embeddings, self.latent_dim, name="vector_quantizer")
106
+ self.encoder = self.get_encoder()
107
+ self.decoder = self.get_decoder()
108
+ inputs = keras.Input(shape=input_shape)
109
+ encoder_outputs = self.encoder(inputs)
110
+ quantized_latents = self.vq_layer(encoder_outputs)
111
+ reconstructions = self.decoder(quantized_latents)
112
+ return keras.Model(inputs, reconstructions, name="vq_vae")
113
+
114
+ def train_step(self, data):
115
+ x, y = data
116
+ with tf.GradientTape() as tape:
117
+ # Outputs from the VQ-VAE.
118
+ reconstructions = self.vqvae(x)
119
+
120
+ # Calculate the losses.
121
+ reconstruction_loss = (
122
+ tf.reduce_mean((y - reconstructions) ** 2) / self.train_variance
123
+ )
124
+ total_loss = reconstruction_loss + sum(self.vqvae.losses)
125
+
126
+ # Backpropagation.
127
+ grads = tape.gradient(total_loss, self.vqvae.trainable_variables)
128
+ self.optimizer.apply_gradients(zip(grads, self.vqvae.trainable_variables))
129
+
130
+ # Loss tracking.
131
+ self.total_loss_tracker.update_state(total_loss)
132
+ self.reconstruction_loss_tracker.update_state(reconstruction_loss)
133
+ self.vq_loss_tracker.update_state(sum(self.vqvae.losses))
134
+
135
+ # Log results.
136
+ return {
137
+ "loss": self.total_loss_tracker.result(),
138
+ "reconstruction_loss": self.reconstruction_loss_tracker.result(),
139
+ "vqvae_loss": self.vq_loss_tracker.result(),
140
+ }
141
+
142
+ def call(self, inputs, training=False, mask=None):
143
+ return self.vqvae(inputs)
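
The `get_code_indices` method relies on the expansion ||x - e||^2 = ||x||^2 + ||e||^2 - 2 x·e to avoid materialising all pairwise differences. A tiny NumPy check (illustrative only) that this matches a brute-force nearest-codebook search:

import numpy as np

rng = np.random.default_rng(0)
flat = rng.normal(size=(5, 4))       # 5 flattened vectors, embedding_dim = 4
codebook = rng.normal(size=(4, 8))   # (embedding_dim, num_embeddings)

similarity = flat @ codebook
distances = (flat ** 2).sum(axis=1, keepdims=True) + (codebook ** 2).sum(axis=0) - 2 * similarity
indices = distances.argmin(axis=1)

brute = np.array([((flat[i][:, None] - codebook) ** 2).sum(axis=0).argmin() for i in range(5)])
assert (indices == brute).all()      # same nearest code either way
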
ganime/model/vqgan/__init__.py ADDED
File without changes
ganime/model/vqgan/discriminator/__init__.py ADDED
File without changes
ganime/model/vqgan/discriminator/model.py ADDED
@@ -0,0 +1,64 @@
1
+ from typing import List
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow import keras
5
+ from tensorflow.keras import Model, Sequential
6
+ from tensorflow.keras import layers
7
+
8
+
9
+ class NLayerDiscriminator(Model):
10
+ """Defines a PatchGAN discriminator as in Pix2Pix
11
+ --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
12
+ """
13
+
14
+ def __init__(self, input_channels: int = 3, filters: int = 64, n_layers: int = 3):
15
+ super().__init__()
16
+
17
+ kernel_size = 4
18
+ self.sequence = [
19
+ layers.Conv2D(filters, kernel_size=kernel_size, padding="same"),
20
+ layers.LeakyReLU(alpha=0.2),
21
+ ]
22
+
23
+ filters_mult = 1
24
+ for n in range(1, n_layers):
25
+ filters_mult = min(2**n, 8)
26
+
27
+ self.sequence += [
28
+ layers.AveragePooling2D(pool_size=2),
29
+ layers.Conv2D(
30
+ filters * filters_mult,
31
+ kernel_size=kernel_size,
32
+ strides=1, # 2,
33
+ padding="same",
34
+ use_bias=False,
35
+ ),
36
+ layers.BatchNormalization(),
37
+ layers.LeakyReLU(alpha=0.2),
38
+ ]
39
+
40
+ filters_mult = min(2**n_layers, 8)
41
+ self.sequence += [
42
+ layers.Conv2D(
43
+ filters * filters_mult,
44
+ kernel_size=kernel_size,
45
+ strides=1,
46
+ padding="same",
47
+ use_bias=False,
48
+ ),
49
+ layers.BatchNormalization(),
50
+ layers.LeakyReLU(alpha=0.2),
51
+ ]
52
+
53
+ self.sequence += [
54
+ layers.Conv2D(1, kernel_size=kernel_size, strides=1, padding="same")
55
+ ]
56
+
57
+ # self.main = Sequential(sequence)
58
+
59
+ def call(self, inputs, training=True, mask=None):
60
+ h = inputs
61
+ for seq in self.sequence:
62
+ h = seq(h)
63
+ return h
64
+ # return self.main(inputs)
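
As a PatchGAN, the discriminator outputs one logit per spatial patch rather than a single score per image; with the two average-pooling stages above, the logit grid is the input resolution divided by four. A quick shape check (illustrative):

import tensorflow as tf

disc = NLayerDiscriminator(input_channels=3, filters=64, n_layers=3)
logits = disc(tf.random.normal((2, 64, 128, 3)))
print(logits.shape)   # expected (2, 16, 32, 1): one real/fake logit per patch
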
ganime/model/vqgan/losses/__init__.py ADDED
File without changes
ganime/model/vqgan/losses/lpips.py ADDED
@@ -0,0 +1,134 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import torchvision.models as models
5
+ from tensorflow import keras
6
+ from tensorflow.keras import Model, Sequential
7
+ from tensorflow.keras import backend as K
8
+ from tensorflow.keras import layers
9
+ from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
10
+ from tensorflow.keras.losses import Loss
11
+ from pyprojroot.pyprojroot import here
12
+
13
+
14
+ def normalize_tensor(x, eps=1e-10):
15
+ norm_factor = tf.sqrt(tf.reduce_sum(x**2, axis=-1, keepdims=True))
16
+ return x / (norm_factor + eps)
17
+
18
+
19
+ class LPIPS(Loss):
20
+ def __init__(self, use_dropout=True, **kwargs):
21
+ super().__init__(**kwargs)
22
+
23
+ self.scaling_layer = ScalingLayer() # preprocess_input
24
+ selected_layers = [
25
+ "block1_conv2",
26
+ "block2_conv2",
27
+ "block3_conv3",
28
+ "block4_conv3",
29
+ "block5_conv3",
30
+ ]
31
+
32
+ # TODO here we load the same weights as pytorch, try with tensorflow weights
33
+ self.net = self.load_vgg16() # VGG16(weights="imagenet", include_top=False)
34
+ self.net.trainable = False
35
+ outputs = [self.net.get_layer(layer).output for layer in selected_layers]
36
+
37
+ self.model = Model(self.net.input, outputs)
38
+ self.lins = [NetLinLayer(use_dropout=use_dropout) for _ in selected_layers]
39
+
40
+ # TODO: here we use the pytorch weights of the linear layers, try without these layers, or without initializing the weights
41
+ self(tf.zeros((1, 16, 16, 1)), tf.zeros((1, 16, 16, 1)))
42
+ self.init_lin_layers()
43
+
44
+ def load_vgg16(self) -> Model:
45
+ """Load a VGG16 model with the same weights as PyTorch
46
+ https://github.com/ezavarygin/vgg16_pytorch2keras
47
+ """
48
+ pytorch_model = models.vgg16(pretrained=True)
49
+ # select weights in the conv2d layers and transpose them to keras dim ordering:
50
+ wblist_torch = list(pytorch_model.parameters())[:26]
51
+ wblist_keras = []
52
+ for i in range(len(wblist_torch)):
53
+ if wblist_torch[i].dim() == 4:
54
+ w = np.transpose(wblist_torch[i].detach().numpy(), axes=[2, 3, 1, 0])
55
+ wblist_keras.append(w)
56
+ elif wblist_torch[i].dim() == 1:
57
+ b = wblist_torch[i].detach().numpy()
58
+ wblist_keras.append(b)
59
+ else:
60
+ raise Exception("Fully connected layers are not implemented.")
61
+
62
+ keras_model = VGG16(include_top=False, weights=None)
63
+ keras_model.set_weights(wblist_keras)
64
+ return keras_model
65
+
66
+ def init_lin_layers(self):
67
+ for i in range(5):
68
+ weights = np.load(
69
+ os.path.join(here(), "models", "NetLinLayer", f"numpy_{i}.npy")
70
+ )
71
+ weights = np.moveaxis(weights, 1, 2)
72
+ self.lins[i].model.layers[1].set_weights([weights])
73
+
74
+ def call(self, y_true, y_pred):
75
+
76
+ scaled_true = self.scaling_layer(y_true)
77
+ scaled_pred = self.scaling_layer(y_pred)
78
+
79
+ outputs_true, outputs_pred = self.model(scaled_true), self.model(scaled_pred)
80
+ features_true, features_pred, diffs = {}, {}, {}
81
+
82
+ for kk in range(len(outputs_true)):
83
+ features_true[kk], features_pred[kk] = normalize_tensor(
84
+ outputs_true[kk]
85
+ ), normalize_tensor(outputs_pred[kk])
86
+
87
+ diffs[kk] = (features_true[kk] - features_pred[kk]) ** 2
88
+
89
+ res = [
90
+ tf.reduce_mean(self.lins[kk](diffs[kk]), axis=(-3, -2), keepdims=True)
91
+ for kk in range(len(outputs_true))
92
+ ]
93
+
94
+ return tf.reduce_sum(res)
95
+
96
+ # h1_list = self.model(self.scaling_layer(y_true))
97
+ # h2_list = self.model(self.scaling_layer(y_pred))
98
+
99
+ # rc_loss = 0.0
100
+ # for h1, h2 in zip(h1_list, h2_list):
101
+ # h1 = K.batch_flatten(h1)
102
+ # h2 = K.batch_flatten(h2)
103
+ # rc_loss += K.sum(K.square(h1 - h2), axis=-1)
104
+
105
+ # return rc_loss
106
+
107
+
108
+ class ScalingLayer(layers.Layer):
109
+ def __init__(self, **kwargs):
110
+ super().__init__(**kwargs)
111
+ self.shift = tf.Variable([-0.030, -0.088, -0.188])
112
+ self.scale = tf.Variable([0.458, 0.448, 0.450])
113
+
114
+ def call(self, inputs):
115
+ return (inputs - self.shift) / self.scale
116
+
117
+
118
+ class NetLinLayer(layers.Layer):
119
+ def __init__(self, channels_out=1, use_dropout=False):
120
+ super().__init__()
121
+ sequence = (
122
+ [
123
+ layers.Dropout(0.5),
124
+ ]
125
+ if use_dropout
126
+ else []
127
+ )
128
+ sequence += [
129
+ layers.Conv2D(channels_out, 1, padding="same", use_bias=False),
130
+ ]
131
+ self.model = Sequential(sequence)
132
+
133
+ def call(self, inputs):
134
+ return self.model(inputs)
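
Once the PyTorch VGG16 weights and the bundled `NetLinLayer` arrays are available, the loss can be used as a plain perceptual distance; the `ScalingLayer` statistics suggest inputs are expected roughly in [-1, 1]. A hedged usage sketch:

import tensorflow as tf

lpips = LPIPS(reduction=tf.keras.losses.Reduction.NONE)
a = tf.random.uniform((1, 64, 64, 3), minval=-1.0, maxval=1.0)
b = tf.random.uniform((1, 64, 64, 3), minval=-1.0, maxval=1.0)
print(float(lpips(a, b)))   # larger value => more perceptually different
print(float(lpips(a, a)))   # identical inputs score (close to) zero
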
ganime/model/vqgan/losses/vqperceptual.py ADDED
@@ -0,0 +1,47 @@
1
+ from typing import List, Literal
2
+
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ from tensorflow import keras
6
+ from tensorflow.keras import Model, layers
7
+ from tensorflow.keras.losses import Loss
8
+
9
+ from .lpips import LPIPS
10
+
11
+ from ..discriminator.model import NLayerDiscriminator
12
+
13
+
14
+ class VQLPIPSWithDiscriminator(Loss):
15
+ def __init__(
16
+ self, *, pixelloss_weight: float = 1.0, perceptual_weight: float = 1.0, **kwargs
17
+ ):
18
+ super().__init__(**kwargs)
19
+ self.pixelloss_weight = pixelloss_weight
20
+ self.perceptual_loss = LPIPS(reduction=tf.keras.losses.Reduction.NONE)
21
+ self.perceptual_weight = perceptual_weight
22
+
23
+ def call(
24
+ self,
25
+ y_true,
26
+ y_pred,
27
+ ):
28
+ reconstruction_loss = tf.abs(y_true - y_pred)
29
+ if self.perceptual_weight > 0:
30
+ perceptual_loss = self.perceptual_loss(y_true, y_pred)
31
+ reconstruction_loss += self.perceptual_weight * perceptual_loss
32
+ else:
33
+ perceptual_loss = 0.0
34
+
35
+ neg_log_likelihood = tf.reduce_mean(reconstruction_loss)
36
+
37
+ return neg_log_likelihood
38
+
39
+ # # GAN part
40
+ # if optimizer_idx == 0:
41
+ # if cond is None:
42
+ # assert not self.disc_conditional
43
+ # logits_fake = self.discriminator(y_pred)
44
+ # else:
45
+ # assert self.disc_conditional
46
+ # logits_fake = self.discriminator(tf.concat([y_pred, cond], axis=-1))
47
+ # g_loss = -tf.reduce_mean(logits_fake)
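
In its current form the loss reduces to mean(|y_true - y_pred| + perceptual_weight * LPIPS(y_true, y_pred)); the commented-out block sketches the adversarial generator term that the VQGAN model below computes itself. A hedged usage sketch (it builds an LPIPS instance, so the same weight files are required):

import tensorflow as tf

loss_fn = VQLPIPSWithDiscriminator(reduction=tf.keras.losses.Reduction.NONE)
y_true = tf.random.uniform((2, 64, 128, 3))
y_pred = tf.random.uniform((2, 64, 128, 3))
nll = loss_fn(y_true, y_pred)   # scalar pixel + perceptual reconstruction term
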
ganime/model/vqgan/vqgan.py ADDED
@@ -0,0 +1,722 @@
1
+ from typing import List, Literal
2
+
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ from .discriminator.model import NLayerDiscriminator
6
+ from .losses.vqperceptual import VQLPIPSWithDiscriminator
7
+ from tensorflow import keras
8
+ from tensorflow.keras import Model, layers, Sequential
9
+ from tensorflow.keras.optimizers import Optimizer
10
+ from tensorflow_addons.layers import GroupNormalization
11
+
12
+ INPUT_SHAPE = (64, 128, 3)
13
+ ENCODER_OUTPUT_SHAPE = (8, 8, 128)
14
+
15
+
16
+ @tf.function
17
+ def hinge_d_loss(logits_real, logits_fake):
18
+ loss_real = tf.reduce_mean(keras.activations.relu(1.0 - logits_real))
19
+ loss_fake = tf.reduce_mean(keras.activations.relu(1.0 + logits_fake))
20
+ d_loss = 0.5 * (loss_real + loss_fake)
21
+ return d_loss
22
+
23
+
24
+ @tf.function
25
+ def vanilla_d_loss(logits_real, logits_fake):
26
+ d_loss = 0.5 * (
27
+ tf.reduce_mean(keras.activations.softplus(-logits_real))
28
+ + tf.reduce_mean(keras.activations.softplus(logits_fake))
29
+ )
30
+ return d_loss
31
+
32
+
33
+ class VQGAN(keras.Model):
34
+ def __init__(
35
+ self,
36
+ train_variance: float,
37
+ num_embeddings: int,
38
+ embedding_dim: int,
39
+ beta: float = 0.25,
40
+ z_channels: int = 128, # 256,
41
+ codebook_weight: float = 1.0,
42
+ disc_num_layers: int = 3,
43
+ disc_factor: float = 1.0,
44
+ disc_iter_start: int = 0,
45
+ disc_conditional: bool = False,
46
+ disc_in_channels: int = 3,
47
+ disc_weight: float = 0.3,
48
+ disc_filters: int = 64,
49
+ disc_loss: Literal["hinge", "vanilla"] = "hinge",
50
+ **kwargs,
51
+ ):
52
+ super().__init__(**kwargs)
53
+ self.train_variance = train_variance
54
+ self.codebook_weight = codebook_weight
55
+
56
+ self.encoder = Encoder()
57
+ self.decoder = Decoder()
58
+ self.quantize = VectorQuantizer(num_embeddings, embedding_dim, beta=beta)
59
+
60
+ self.quant_conv = layers.Conv2D(embedding_dim, kernel_size=1)
61
+ self.post_quant_conv = layers.Conv2D(z_channels, kernel_size=1)
62
+
63
+ self.vqvae = self.get_vqvae()
64
+
65
+ self.perceptual_loss = VQLPIPSWithDiscriminator(
66
+ reduction=tf.keras.losses.Reduction.NONE
67
+ )
68
+
69
+ self.discriminator = NLayerDiscriminator(
70
+ input_channels=disc_in_channels,
71
+ filters=disc_filters,
72
+ n_layers=disc_num_layers,
73
+ )
74
+ self.discriminator_iter_start = disc_iter_start
75
+
76
+ if disc_loss == "hinge":
77
+ self.disc_loss = hinge_d_loss
78
+ elif disc_loss == "vanilla":
79
+ self.disc_loss = vanilla_d_loss
80
+ else:
81
+ raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
82
+
83
+ print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
84
+ self.disc_factor = disc_factor
85
+ self.discriminator_weight = disc_weight
86
+ self.disc_conditional = disc_conditional
87
+
88
+ self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
89
+ self.reconstruction_loss_tracker = keras.metrics.Mean(
90
+ name="reconstruction_loss"
91
+ )
92
+ self.vq_loss_tracker = keras.metrics.Mean(name="vq_loss")
93
+ self.disc_loss_tracker = keras.metrics.Mean(name="disc_loss")
94
+
95
+ self.gen_optimizer: Optimizer = None
96
+ self.disc_optimizer: Optimizer = None
97
+
98
+ def get_vqvae(self):
99
+ inputs = keras.Input(shape=INPUT_SHAPE)
100
+ quant = self.encode(inputs)
101
+ reconstructed = self.decode(quant)
102
+ return keras.Model(inputs, reconstructed, name="vq_vae")
103
+
104
+ def encode(self, x):
105
+ h = self.encoder(x)
106
+ h = self.quant_conv(h)
107
+ return self.quantize(h)
108
+
109
+ def decode(self, quant):
110
+ quant = self.post_quant_conv(quant)
111
+ dec = self.decoder(quant)
112
+ return dec
113
+
114
+ def call(self, inputs, training=True, mask=None):
115
+ return self.vqvae(inputs)
116
+
117
+ def calculate_adaptive_weight(
118
+ self, nll_loss, g_loss, tape, trainable_vars, discriminator_weight
119
+ ):
120
+ nll_grads = tape.gradient(nll_loss, trainable_vars)[0]
121
+ g_grads = tape.gradient(g_loss, trainable_vars)[0]
122
+
123
+ d_weight = tf.norm(nll_grads) / (tf.norm(g_grads) + 1e-4)
124
+ d_weight = tf.stop_gradient(tf.clip_by_value(d_weight, 0.0, 1e4))
125
+ return d_weight * discriminator_weight
126
+
127
+ @tf.function
128
+ def adopt_weight(self, weight, global_step, threshold=0, value=0.0):
129
+ if global_step < threshold:
130
+ weight = value
131
+ return weight
132
+
133
+ def get_global_step(self, optimizer):
134
+ return optimizer.iterations
135
+
136
+ def compile(
137
+ self,
138
+ gen_optimizer,
139
+ disc_optimizer,
140
+ ):
141
+ super().compile()
142
+ self.gen_optimizer = gen_optimizer
143
+ self.disc_optimizer = disc_optimizer
144
+
145
+ def train_step(self, data):
146
+ x, y = data
147
+
148
+ # Autoencode
149
+ with tf.GradientTape() as tape:
150
+ with tf.GradientTape(persistent=True) as adaptive_tape:
151
+ reconstructions = self(x, training=True)
152
+
153
+ # Calculate the losses.
154
+ # reconstruction_loss = (
155
+ # tf.reduce_mean((y - reconstructions) ** 2) / self.train_variance
156
+ # )
157
+
158
+ logits_fake = self.discriminator(reconstructions, training=False)
159
+
160
+ g_loss = -tf.reduce_mean(logits_fake)
161
+ nll_loss = self.perceptual_loss(y, reconstructions)
162
+
163
+ d_weight = self.calculate_adaptive_weight(
164
+ nll_loss,
165
+ g_loss,
166
+ adaptive_tape,
167
+ self.decoder.conv_out.trainable_variables,
168
+ self.discriminator_weight,
169
+ )
170
+ del adaptive_tape
171
+
172
+ disc_factor = self.adopt_weight(
173
+ weight=self.disc_factor,
174
+ global_step=self.get_global_step(self.gen_optimizer),
175
+ threshold=self.discriminator_iter_start,
176
+ )
177
+
178
+ # total_loss = reconstruction_loss + sum(self.vqvae.losses)
179
+ total_loss = (
180
+ nll_loss
181
+ + d_weight * disc_factor * g_loss
182
+ # + self.codebook_weight * tf.reduce_mean(self.vqvae.losses)
183
+ + self.codebook_weight * sum(self.vqvae.losses)
184
+ )
185
+
186
+ # Backpropagation.
187
+ grads = tape.gradient(total_loss, self.vqvae.trainable_variables)
188
+ self.gen_optimizer.apply_gradients(zip(grads, self.vqvae.trainable_variables))
189
+
190
+ # Discriminator
191
+ with tf.GradientTape() as disc_tape:
192
+ logits_real = self.discriminator(y, training=True)
193
+ logits_fake = self.discriminator(reconstructions, training=True)
194
+
195
+ disc_factor = self.adopt_weight(
196
+ weight=self.disc_factor,
197
+ global_step=self.get_global_step(self.disc_optimizer),
198
+ threshold=self.discriminator_iter_start,
199
+ )
200
+ d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)
201
+
202
+ disc_grads = disc_tape.gradient(d_loss, self.discriminator.trainable_variables)
203
+ self.disc_optimizer.apply_gradients(
204
+ zip(disc_grads, self.discriminator.trainable_variables)
205
+ )
206
+
207
+ # Loss tracking.
208
+ self.total_loss_tracker.update_state(total_loss)
209
+ self.reconstruction_loss_tracker.update_state(nll_loss)
210
+ self.vq_loss_tracker.update_state(sum(self.vqvae.losses))
211
+ self.disc_loss_tracker.update_state(d_loss)
212
+
213
+ # Log results.
214
+ return {
215
+ "loss": self.total_loss_tracker.result(),
216
+ "reconstruction_loss": self.reconstruction_loss_tracker.result(),
217
+ "vqvae_loss": self.vq_loss_tracker.result(),
218
+ "disc_loss": self.disc_loss_tracker.result(),
219
+ }
220
+
221
+
222
+ class VectorQuantizer(layers.Layer):
223
+ def __init__(self, num_embeddings, embedding_dim, beta=0.25, **kwargs):
224
+ super().__init__(**kwargs)
225
+ self.embedding_dim = embedding_dim
226
+ self.num_embeddings = num_embeddings
227
+ self.beta = (
228
+ beta # This parameter is best kept between [0.25, 2] as per the paper.
229
+ )
230
+
231
+ # Initialize the embeddings which we will quantize.
232
+ w_init = tf.random_uniform_initializer()
233
+ self.embeddings = tf.Variable(
234
+ initial_value=w_init(
235
+ shape=(self.embedding_dim, self.num_embeddings) # , dtype="float32"
236
+ ),
237
+ trainable=True,
238
+ name="embeddings_vqvae",
239
+ )
240
+
241
+ def call(self, x):
242
+ # Calculate the input shape of the inputs and
243
+ # then flatten the inputs keeping `embedding_dim` intact.
244
+ input_shape = tf.shape(x)
245
+ flattened = tf.reshape(x, [-1, self.embedding_dim])
246
+
247
+ # Quantization.
248
+ encoding_indices = self.get_code_indices(flattened)
249
+ encodings = tf.one_hot(encoding_indices, self.num_embeddings)
250
+ quantized = tf.matmul(encodings, self.embeddings, transpose_b=True)
251
+ quantized = tf.reshape(quantized, input_shape)
252
+
253
+ # Calculate vector quantization loss and add that to the layer. You can learn more
254
+ # about adding losses to different layers here:
255
+ # https://keras.io/guides/making_new_layers_and_models_via_subclassing/. Check
256
+ # the original paper to get a handle on the formulation of the loss function.
257
+ commitment_loss = self.beta * tf.reduce_mean(
258
+ (tf.stop_gradient(quantized) - x) ** 2
259
+ )
260
+ codebook_loss = tf.reduce_mean((quantized - tf.stop_gradient(x)) ** 2)
261
+ self.add_loss(commitment_loss + codebook_loss)
262
+
263
+ # Straight-through estimator.
264
+ quantized = x + tf.stop_gradient(quantized - x)
265
+ return quantized
266
+
267
+ def get_code_indices(self, flattened_inputs):
268
+ # Calculate L2-normalized distance between the inputs and the codes.
269
+ similarity = tf.matmul(flattened_inputs, self.embeddings)
270
+ distances = (
271
+ tf.reduce_sum(flattened_inputs**2, axis=1, keepdims=True)
272
+ + tf.reduce_sum(self.embeddings**2, axis=0)
273
+ - 2 * similarity
274
+ )
275
+
276
+ # Derive the indices for minimum distances.
277
+ encoding_indices = tf.argmin(distances, axis=1)
278
+ return encoding_indices
279
+
280
+
281
+ class Encoder(Model):
282
+ def __init__(
283
+ self,
284
+ *,
285
+ channels: int = 128,
286
+ output_channels: int = 3,
287
+ channels_multiplier: List[int] = [1, 1, 2, 2], # [1, 1, 2, 2, 4],
288
+ num_res_blocks: int = 1, # 2,
289
+ attention_resolution: List[int] = [16],
290
+ resolution: int = 64, # 256,
291
+ z_channels=128, # 256,
292
+ dropout=0.0,
293
+ double_z=False,
294
+ resamp_with_conv=True,
295
+ ):
296
+ super().__init__()
297
+
298
+ self.channels = channels
299
+ self.timestep_embeddings_channel = 0
300
+ self.num_resolutions = len(channels_multiplier)
301
+ self.num_res_blocks = num_res_blocks
302
+ self.resolution = resolution
303
+
304
+ self.conv_in = layers.Conv2D(
305
+ self.channels, kernel_size=3, strides=1, padding="same"
306
+ )
307
+
308
+ current_resolution = resolution
309
+
310
+ in_channels_multiplier = (1,) + tuple(channels_multiplier)
311
+
312
+ self.downsampling_list = []
313
+
314
+ for i_level in range(self.num_resolutions):
315
+ block_in = channels * in_channels_multiplier[i_level]
316
+ block_out = channels * channels_multiplier[i_level]
317
+ for i_block in range(self.num_res_blocks):
318
+ self.downsampling_list.append(
319
+ ResnetBlock(
320
+ in_channels=block_in,
321
+ out_channels=block_out,
322
+ timestep_embedding_channels=self.timestep_embeddings_channel,
323
+ dropout=dropout,
324
+ )
325
+ )
326
+ block_in = block_out
327
+
328
+ if current_resolution in attention_resolution:
329
+ # attentions.append(layers.Attention())
330
+ self.downsampling_list.append(AttentionBlock(block_in))
331
+
332
+ if i_level != self.num_resolutions - 1:
333
+ self.downsampling_list.append(Downsample(block_in, resamp_with_conv))
334
+
335
+ # self.downsampling = []
336
+
337
+ # for i_level in range(self.num_resolutions):
338
+ # block = []
339
+ # attentions = []
340
+ # block_in = channels * in_channels_multiplier[i_level]
341
+ # block_out = channels * channels_multiplier[i_level]
342
+ # for i_block in range(self.num_res_blocks):
343
+ # block.append(
344
+ # ResnetBlock(
345
+ # in_channels=block_in,
346
+ # out_channels=block_out,
347
+ # timestep_embedding_channels=self.timestep_embeddings_channel,
348
+ # dropout=dropout,
349
+ # )
350
+ # )
351
+ # block_in = block_out
352
+
353
+ # if current_resolution in attention_resolution:
354
+ # # attentions.append(layers.Attention())
355
+ # attentions.append(AttentionBlock(block_in))
356
+
357
+ # down = {}
358
+ # down["block"] = block
359
+ # down["attention"] = attentions
360
+ # if i_level != self.num_resolutions - 1:
361
+ # down["downsample"] = Downsample(block_in, resamp_with_conv)
362
+ # self.downsampling.append(down)
363
+
364
+ # middle
365
+ self.mid = {}
366
+ self.mid["block_1"] = ResnetBlock(
367
+ in_channels=block_in,
368
+ out_channels=block_in,
369
+ timestep_embedding_channels=self.timestep_embeddings_channel,
370
+ dropout=dropout,
371
+ )
372
+ self.mid["attn_1"] = AttentionBlock(block_in)
373
+ self.mid["block_2"] = ResnetBlock(
374
+ in_channels=block_in,
375
+ out_channels=block_in,
376
+ timestep_embedding_channels=self.timestep_embeddings_channel,
377
+ dropout=dropout,
378
+ )
379
+
380
+ # end
381
+ self.norm_out = GroupNormalization(groups=32, epsilon=1e-6)
382
+ self.conv_out = layers.Conv2D(
383
+ 2 * z_channels if double_z else z_channels,
384
+ kernel_size=3,
385
+ strides=1,
386
+ padding="same",
387
+ )
388
+
389
+ def summary(self):
390
+ x = layers.Input(shape=INPUT_SHAPE)
391
+ model = Model(inputs=[x], outputs=self.call(x))
392
+ return model.summary()
393
+
394
+ def call(self, inputs, training=True, mask=None):
395
+ h = self.conv_in(inputs)
396
+ for downsampling in self.downsampling_list:
397
+ h = downsampling(h)
398
+ # for i_level in range(self.num_resolutions):
399
+ # for i_block in range(self.num_res_blocks):
400
+ # h = self.downsampling[i_level]["block"][i_block](hs[-1])
401
+ # if len(self.downsampling[i_level]["attention"]) > 0:
402
+ # h = self.downsampling[i_level]["attention"][i_block](h)
403
+ # hs.append(h)
404
+ # if i_level != self.num_resolutions - 1:
405
+ # hs.append(self.downsampling[i_level]["downsample"](hs[-1]))
406
+
407
+ # h = hs[-1]
408
+ h = self.mid["block_1"](h)
409
+ h = self.mid["attn_1"](h)
410
+ h = self.mid["block_2"](h)
411
+
412
+ # end
413
+ h = self.norm_out(h)
414
+ h = keras.activations.swish(h)
415
+ h = self.conv_out(h)
416
+ return h
417
+
418
+
419
+ class Decoder(Model):
420
+ def __init__(
421
+ self,
422
+ *,
423
+ channels: int = 128,
424
+ output_channels: int = 3,
425
+ channels_multiplier: List[int] = [1, 1, 2, 2], # [1, 1, 2, 2, 4],
426
+ num_res_blocks: int = 1, # 2,
427
+ attention_resolution: List[int] = [16],
428
+ resolution: int = 64, # 256,
429
+ z_channels=128, # 256,
430
+ dropout=0.0,
431
+ give_pre_end=False,
432
+ resamp_with_conv=True,
433
+ ):
434
+ super().__init__()
435
+
436
+ self.channels = channels
437
+ self.timestep_embeddings_channel = 0
438
+ self.num_resolutions = len(channels_multiplier)
439
+ self.num_res_blocks = num_res_blocks
440
+ self.resolution = resolution
441
+ self.give_pre_end = give_pre_end
442
+
443
+ in_channels_multiplier = (1,) + tuple(channels_multiplier)
444
+ block_in = channels * channels_multiplier[-1]
445
+ current_resolution = resolution // 2 ** (self.num_resolutions - 1)
446
+ self.z_shape = (1, z_channels, current_resolution, current_resolution)
447
+
448
+ print(
449
+ "Working with z of shape {} = {} dimensions.".format(
450
+ self.z_shape, np.prod(self.z_shape)
451
+ )
452
+ )
453
+
454
+ self.conv_in = layers.Conv2D(block_in, kernel_size=3, strides=1, padding="same")
455
+
456
+ # middle
457
+ self.mid = {}
458
+ self.mid["block_1"] = ResnetBlock(
459
+ in_channels=block_in,
460
+ out_channels=block_in,
461
+ timestep_embedding_channels=self.timestep_embeddings_channel,
462
+ dropout=dropout,
463
+ )
464
+ self.mid["attn_1"] = AttentionBlock(block_in)
465
+ self.mid["block_2"] = ResnetBlock(
466
+ in_channels=block_in,
467
+ out_channels=block_in,
468
+ timestep_embedding_channels=self.timestep_embeddings_channel,
469
+ dropout=dropout,
470
+ )
471
+
472
+ # upsampling
473
+
474
+ self.upsampling_list = []
475
+
476
+ for i_level in reversed(range(self.num_resolutions)):
477
+ block_out = channels * channels_multiplier[i_level]
478
+ for i_block in range(self.num_res_blocks + 1):
479
+ self.upsampling_list.append(
480
+ ResnetBlock(
481
+ in_channels=block_in,
482
+ out_channels=block_out,
483
+ timestep_embedding_channels=self.timestep_embeddings_channel,
484
+ dropout=dropout,
485
+ )
486
+ )
487
+ block_in = block_out
488
+
489
+ if current_resolution in attention_resolution:
490
+ # attentions.append(layers.Attention())
491
+ self.upsampling_list.append(AttentionBlock(block_in))
492
+
493
+ if i_level != 0:
494
+ self.upsampling_list.append(Upsample(block_in, resamp_with_conv))
495
+ current_resolution *= 2
496
+ # self.upsampling.insert(0, upsampling)
497
+
498
+ # self.upsampling = []
499
+
500
+ # for i_level in reversed(range(self.num_resolutions)):
501
+ # block = []
502
+ # attentions = []
503
+ # block_out = channels * channels_multiplier[i_level]
504
+ # for i_block in range(self.num_res_blocks + 1):
505
+ # block.append(
506
+ # ResnetBlock(
507
+ # in_channels=block_in,
508
+ # out_channels=block_out,
509
+ # timestep_embedding_channels=self.timestep_embeddings_channel,
510
+ # dropout=dropout,
511
+ # )
512
+ # )
513
+ # block_in = block_out
514
+
515
+ # if current_resolution in attention_resolution:
516
+ # # attentions.append(layers.Attention())
517
+ # attentions.append(AttentionBlock(block_in))
518
+
519
+ # upsampling = {}
520
+ # upsampling["block"] = block
521
+ # upsampling["attention"] = attentions
522
+ # if i_level != 0:
523
+ # upsampling["upsample"] = Upsample(block_in, resamp_with_conv)
524
+ # current_resolution *= 2
525
+ # self.upsampling.insert(0, upsampling)
526
+
527
+ # end
528
+ self.norm_out = GroupNormalization(groups=32, epsilon=1e-6)
529
+ self.conv_out = layers.Conv2D(
530
+ output_channels,
531
+ kernel_size=3,
532
+ strides=1,
533
+ activation="sigmoid",
534
+ padding="same",
535
+ )
536
+
537
+ def summary(self):
538
+ x = layers.Input(shape=ENCODER_OUTPUT_SHAPE)
539
+ model = Model(inputs=[x], outputs=self.call(x))
540
+ return model.summary()
541
+
542
+ def call(self, inputs, training=True, mask=None):
543
+
544
+ h = self.conv_in(inputs)
545
+
546
+ # middle
547
+ h = self.mid["block_1"](h)
548
+ h = self.mid["attn_1"](h)
549
+ h = self.mid["block_2"](h)
550
+
551
+ for upsampling in self.upsampling_list:
552
+ h = upsampling(h)
553
+
554
+ # for i_level in reversed(range(self.num_resolutions)):
555
+ # for i_block in range(self.num_res_blocks + 1):
556
+ # h = self.upsampling[i_level]["block"][i_block](h)
557
+ # if len(self.upsampling[i_level]["attention"]) > 0:
558
+ # h = self.upsampling[i_level]["attention"][i_block](h)
559
+ # if i_level != 0:
560
+ # h = self.upsampling[i_level]["upsample"](h)
561
+
562
+ # end
563
+ if self.give_pre_end:
564
+ return h
565
+
566
+ h = self.norm_out(h)
567
+ h = keras.activations.swish(h)
568
+ h = self.conv_out(h)
569
+ return h
570
+
571
+
572
+ class ResnetBlock(layers.Layer):
573
+ def __init__(
574
+ self,
575
+ *,
576
+ in_channels,
577
+ dropout=0.0,
578
+ out_channels=None,
579
+ conv_shortcut=False,
580
+ timestep_embedding_channels=512,
581
+ ):
582
+ super().__init__()
583
+ self.in_channels = in_channels
584
+ out_channels = in_channels if out_channels is None else out_channels
585
+ self.out_channels = out_channels
586
+ self.use_conv_shortcut = conv_shortcut
587
+
588
+ self.norm1 = GroupNormalization(groups=32, epsilon=1e-6)
589
+
590
+ self.conv1 = layers.Conv2D(
591
+ out_channels, kernel_size=3, strides=1, padding="same"
592
+ )
593
+
594
+ if timestep_embedding_channels > 0:
595
+ self.timestep_embedding_projection = layers.Dense(out_channels)
596
+
597
+ self.norm2 = GroupNormalization(groups=32, epsilon=1e-6)
598
+ self.dropout = layers.Dropout(dropout)
599
+
600
+ self.conv2 = layers.Conv2D(
601
+ out_channels, kernel_size=3, strides=1, padding="same"
602
+ )
603
+
604
+ if self.in_channels != self.out_channels:
605
+ if self.use_conv_shortcut:
606
+ self.conv_shortcut = layers.Conv2D(
607
+ out_channels, kernel_size=3, strides=1, padding="same"
608
+ )
609
+ else:
610
+ self.nin_shortcut = layers.Conv2D(
611
+ out_channels, kernel_size=1, strides=1, padding="valid"
612
+ )
613
+
614
+ def call(self, x):
615
+ h = x
616
+ h = self.norm1(h)
617
+ h = keras.activations.swish(h)
618
+ h = self.conv1(h)
619
+
620
+ # if timestamp_embedding is not None:
621
+ # h = h + self.timestep_embedding_projection(keras.activations.swish(timestamp_embedding))
622
+
623
+ h = self.norm2(h)
624
+ h = keras.activations.swish(h)
625
+ h = self.dropout(h)
626
+ h = self.conv2(h)
627
+
628
+ if self.in_channels != self.out_channels:
629
+ if self.use_conv_shortcut:
630
+ x = self.conv_shortcut(x)
631
+ else:
632
+ x = self.nin_shortcut(x)
633
+
634
+ return x + h
635
+
636
+
637
+ class AttentionBlock(layers.Layer):
638
+ def __init__(self, channels):
639
+ super().__init__()
640
+
641
+ self.norm = GroupNormalization(groups=32, epsilon=1e-6)
642
+ self.q = layers.Conv2D(channels, kernel_size=1, strides=1, padding="valid")
643
+ self.k = layers.Conv2D(channels, kernel_size=1, strides=1, padding="valid")
644
+ self.v = layers.Conv2D(channels, kernel_size=1, strides=1, padding="valid")
645
+ self.proj_out = layers.Conv2D(
646
+ channels, kernel_size=1, strides=1, padding="valid"
647
+ )
648
+
649
+ def call(self, x):
650
+ h_ = x
651
+ h_ = self.norm(h_)
652
+ q = self.q(h_)
653
+ k = self.k(h_)
654
+ v = self.v(h_)
655
+
656
+ # compute attention
657
+ (
658
+ b,
659
+ h,
660
+ w,
661
+ c,
662
+ ) = q.shape
663
+ if b is None:
664
+ b = -1
665
+ q = tf.reshape(q, [b, h * w, c])
666
+ k = tf.reshape(k, [b, h * w, c])
667
+ w_ = tf.matmul(
668
+ q, k, transpose_b=True
669
+ ) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
670
+ w_ = w_ * (int(c) ** (-0.5))
671
+ w_ = keras.activations.softmax(w_)
672
+
673
+ # attend to values
674
+ v = tf.reshape(v, [b, h * w, c])
675
+ # w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
676
+ h_ = tf.matmul(
677
+ v, w_, transpose_a=True
678
+ ) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
679
+ # h_ = h_.reshape(b, c, h, w)
680
+ h_ = tf.reshape(h_, [b, h, w, c])
681
+
682
+ h_ = self.proj_out(h_)
683
+
684
+ return x + h_
685
+
686
+
687
+ class Downsample(layers.Layer):
688
+ def __init__(self, channels, with_conv=True):
689
+ super().__init__()
690
+ self.with_conv = with_conv
691
+ if self.with_conv:
692
+ # no asymmetric padding in torch conv, must do it ourselves
693
+ self.down_sample = layers.Conv2D(
694
+ channels, kernel_size=3, strides=2, padding="same"
695
+ )
696
+ else:
697
+ self.down_sample = layers.AveragePooling2D(pool_size=2, strides=2)
698
+
699
+ def call(self, x):
700
+ x = self.down_sample(x)
701
+ return x
702
+
703
+
704
+ class Upsample(layers.Layer):
705
+ def __init__(self, channels, with_conv=False):
706
+ super().__init__()
707
+ self.with_conv = with_conv
708
+ if False:  # Conv2DTranspose path disabled for now; self.with_conv is currently ignored
709
+ self.up_sample = layers.Conv2DTranspose(
710
+ channels, kernel_size=3, strides=2, padding="same"
711
+ )
712
+ else:
713
+ self.up_sample = Sequential(
714
+ [
715
+ layers.UpSampling2D(size=2, interpolation="nearest"),
716
+ layers.Conv2D(channels, kernel_size=3, strides=1, padding="same"),
717
+ ]
718
+ )
719
+
720
+ def call(self, x):
721
+ x = self.up_sample(x)
722
+ return x
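A minimal shape sketch for the ResnetBlock defined above, assuming it is importable from this module; the sizes are illustrative rather than taken from the project's configs. When in_channels differs from out_channels and conv_shortcut is False, the 1x1 nin_shortcut projects the input so the residual add lines up:

import tensorflow as tf

# Illustrative only: assumes the ResnetBlock defined above is in scope.
block = ResnetBlock(in_channels=64, out_channels=128, dropout=0.1)
x = tf.random.normal((2, 32, 32, 64))
y = block(x)        # nin_shortcut projects x to 128 channels before the skip add
print(y.shape)      # (2, 32, 32, 128)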
ganime/model/vqgan_clean/__init__.py ADDED
File without changes
ganime/model/vqgan_clean/diffusion/__init__.py ADDED
File without changes
ganime/model/vqgan_clean/diffusion/decoder.py ADDED
@@ -0,0 +1,115 @@
1
+ from typing import List
2
+
3
+ import numpy as np
4
+ import tensorflow as tf
5
+ from tensorflow import keras
6
+ from tensorflow.keras import Model, layers
7
+ from tensorflow_addons.layers import GroupNormalization
8
+
9
+ from .layers import AttentionBlock, ResnetBlock, Upsample
10
+
11
+
12
+ # @tf.keras.utils.register_keras_serializable()
13
+ class Decoder(layers.Layer):
14
+ def __init__(
15
+ self,
16
+ *,
17
+ channels: int,
18
+ output_channels: int = 3,
19
+ channels_multiplier: List[int],
20
+ num_res_blocks: int,
21
+ attention_resolution: List[int],
22
+ resolution: int,
23
+ z_channels: int,
24
+ dropout: float,
25
+ **kwargs
26
+ ):
27
+ super().__init__(**kwargs)
28
+
29
+ self.channels = channels
30
+ self.output_channels = output_channels
31
+ self.channels_multiplier = channels_multiplier
32
+ self.num_resolutions = len(channels_multiplier)
33
+ self.num_res_blocks = num_res_blocks
34
+ self.attention_resolution = attention_resolution
35
+ self.resolution = resolution
36
+ self.z_channels = z_channels
37
+ self.dropout = dropout
38
+
39
+ block_in = channels * channels_multiplier[-1]
40
+ current_resolution = resolution // 2 ** (self.num_resolutions - 1)
41
+ self.z_shape = (1, z_channels, current_resolution, current_resolution)
42
+
43
+ print(
44
+ "Working with z of shape {} = {} dimensions.".format(
45
+ self.z_shape, np.prod(self.z_shape)
46
+ )
47
+ )
48
+
49
+ self.conv_in = layers.Conv2D(block_in, kernel_size=3, strides=1, padding="same")
50
+
51
+ # middle
52
+ self.mid = {}
53
+ self.mid["block_1"] = ResnetBlock(
54
+ in_channels=block_in,
55
+ out_channels=block_in,
56
+ dropout=dropout,
57
+ )
58
+ self.mid["attn_1"] = AttentionBlock(block_in)
59
+ self.mid["block_2"] = ResnetBlock(
60
+ in_channels=block_in,
61
+ out_channels=block_in,
62
+ dropout=dropout,
63
+ )
64
+
65
+ # upsampling
66
+
67
+ self.upsampling_list = []
68
+
69
+ for i_level in reversed(range(self.num_resolutions)):
70
+ block_out = channels * channels_multiplier[i_level]
71
+ for i_block in range(self.num_res_blocks + 1):
72
+ self.upsampling_list.append(
73
+ ResnetBlock(
74
+ in_channels=block_in,
75
+ out_channels=block_out,
76
+ dropout=dropout,
77
+ )
78
+ )
79
+ block_in = block_out
80
+
81
+ if current_resolution in attention_resolution:
82
+ # attentions.append(layers.Attention())
83
+ self.upsampling_list.append(AttentionBlock(block_in))
84
+
85
+ if i_level != 0:
86
+ self.upsampling_list.append(Upsample(block_in))
87
+ current_resolution *= 2
88
+
89
+ # end
90
+ self.norm_out = GroupNormalization(groups=32, epsilon=1e-6)
91
+ self.conv_out = layers.Conv2D(
92
+ output_channels,
93
+ kernel_size=3,
94
+ strides=1,
95
+ activation="tanh",
96
+ padding="same",
97
+ )
98
+
99
+ def call(self, inputs, training=True, mask=None):
100
+
101
+ h = self.conv_in(inputs)
102
+
103
+ # middle
104
+ h = self.mid["block_1"](h)
105
+ h = self.mid["attn_1"](h)
106
+ h = self.mid["block_2"](h)
107
+
108
+ for upsampling in self.upsampling_list:
109
+ h = upsampling(h)
110
+
111
+ # end
112
+ h = self.norm_out(h)
113
+ h = keras.activations.swish(h)
114
+ h = self.conv_out(h)
115
+ return h
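As a hedged usage sketch of the Decoder above (the hyperparameter values are illustrative, not taken from the repository's configs): with a channels_multiplier of length 4 the decoder upsamples three times, so an 8x8 latent is decoded back to a 64x64, tanh-bounded image.

import tensorflow as tf
from ganime.model.vqgan_clean.diffusion.decoder import Decoder  # path added in this commit

decoder = Decoder(
    channels=128,
    output_channels=3,
    channels_multiplier=[1, 1, 2, 2],  # illustrative values
    num_res_blocks=1,
    attention_resolution=[16],
    resolution=64,
    z_channels=128,
    dropout=0.0,
)
z = tf.random.normal((1, 8, 8, 128))  # 8 = resolution // 2 ** (num_resolutions - 1)
x_rec = decoder(z)                    # (1, 64, 64, 3), bounded to [-1, 1] by the tanh in conv_out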
ganime/model/vqgan_clean/diffusion/encoder.py ADDED
@@ -0,0 +1,125 @@
1
+ from typing import List
2
+ import tensorflow as tf
3
+ from tensorflow import keras
4
+ from tensorflow.keras import layers, Model
5
+ from tensorflow_addons.layers import GroupNormalization
6
+ from .layers import ResnetBlock, AttentionBlock, Downsample
7
+
8
+
9
+ # @tf.keras.utils.register_keras_serializable()
10
+ class Encoder(layers.Layer):
11
+ def __init__(
12
+ self,
13
+ *,
14
+ channels: int,
15
+ channels_multiplier: List[int],
16
+ num_res_blocks: int,
17
+ attention_resolution: List[int],
18
+ resolution: int,
19
+ z_channels: int,
20
+ dropout: float,
21
+ **kwargs
22
+ ):
23
+ """Encode an image into a latent vector. The encoder is built from multiple levels (one per entry of `channels_multiplier`), each containing `num_res_blocks` ResnetBlocks.
24
+ Args:
25
+ channels (int, optional): The number of channel for the first layer. Defaults to 128.
26
+ channels_multiplier (List[int], optional): The channel multiplier for each level (previous level channels x multiplier). Defaults to [1, 1, 2, 2].
27
+ num_res_blocks (int, optional): Number of ResnetBlock at each level. Defaults to 1.
28
+ attention_resolution (List[int], optional): Add an attention block if the current resolution is in this array. Defaults to [16].
29
+ resolution (int, optional): The starting resolution. Defaults to 64.
30
+ z_channels (int, optional): The number of channel at the end of the encoder. Defaults to 128.
31
+ dropout (float, optional): The dropout ratio for each ResnetBlock. Defaults to 0.0.
32
+ """
33
+ super().__init__(**kwargs)
34
+
35
+ self.channels = channels
36
+ self.channels_multiplier = channels_multiplier
37
+ self.num_resolutions = len(channels_multiplier)
38
+ self.num_res_blocks = num_res_blocks
39
+ self.attention_resolution = attention_resolution
40
+ self.resolution = resolution
41
+ self.z_channels = z_channels
42
+ self.dropout = dropout
43
+
44
+ self.conv_in = layers.Conv2D(
45
+ self.channels, kernel_size=3, strides=1, padding="same"
46
+ )
47
+
48
+ current_resolution = resolution
49
+
50
+ in_channels_multiplier = (1,) + tuple(channels_multiplier)
51
+
52
+ self.downsampling_list = []
53
+
54
+ for i_level in range(self.num_resolutions):
55
+ block_in = channels * in_channels_multiplier[i_level]
56
+ block_out = channels * channels_multiplier[i_level]
57
+ for i_block in range(self.num_res_blocks):
58
+ self.downsampling_list.append(
59
+ ResnetBlock(
60
+ in_channels=block_in,
61
+ out_channels=block_out,
62
+ dropout=dropout,
63
+ )
64
+ )
65
+ block_in = block_out
66
+
67
+ if current_resolution in attention_resolution:
68
+ self.downsampling_list.append(AttentionBlock(block_in))
69
+
70
+ if i_level != self.num_resolutions - 1:
71
+ self.downsampling_list.append(Downsample(block_in))
72
+ current_resolution = current_resolution // 2
73
+
74
+ # middle
75
+ self.mid = {}
76
+ self.mid["block_1"] = ResnetBlock(
77
+ in_channels=block_in,
78
+ out_channels=block_in,
79
+ dropout=dropout,
80
+ )
81
+ self.mid["attn_1"] = AttentionBlock(block_in)
82
+ self.mid["block_2"] = ResnetBlock(
83
+ in_channels=block_in,
84
+ out_channels=block_in,
85
+ dropout=dropout,
86
+ )
87
+
88
+ # end
89
+ self.norm_out = GroupNormalization(groups=32, epsilon=1e-6)
90
+ self.conv_out = layers.Conv2D(
91
+ z_channels,
92
+ kernel_size=3,
93
+ strides=1,
94
+ padding="same",
95
+ )
96
+
97
+ # def get_config(self):
98
+ # config = super().get_config()
99
+ # config.update(
100
+ # {
101
+ # "channels": self.channels,
102
+ # "channels_multiplier": self.channels_multiplier,
103
+ # "num_res_blocks": self.num_res_blocks,
104
+ # "attention_resolution": self.attention_resolution,
105
+ # "resolution": self.resolution,
106
+ # "z_channels": self.z_channels,
107
+ # "dropout": self.dropout,
108
+ # }
109
+ # )
110
+ # return config
111
+
112
+ def call(self, inputs, training=True, mask=None):
113
+ h = self.conv_in(inputs)
114
+ for downsampling in self.downsampling_list:
115
+ h = downsampling(h)
116
+
117
+ h = self.mid["block_1"](h)
118
+ h = self.mid["attn_1"](h)
119
+ h = self.mid["block_2"](h)
120
+
121
+ # end
122
+ h = self.norm_out(h)
123
+ h = keras.activations.swish(h)
124
+ h = self.conv_out(h)
125
+ return h
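A minimal sketch of the shape contract described in the docstring above, using the values it mentions as defaults (channels=128, channels_multiplier=[1, 1, 2, 2], resolution=64); only the Encoder defined in this file is assumed. Three Downsample steps take a 64x64 image to an 8x8 latent with z_channels channels:

import tensorflow as tf
from ganime.model.vqgan_clean.diffusion.encoder import Encoder  # path added in this commit

encoder = Encoder(
    channels=128,
    channels_multiplier=[1, 1, 2, 2],
    num_res_blocks=1,
    attention_resolution=[16],
    resolution=64,
    z_channels=128,
    dropout=0.0,
)
images = tf.random.normal((1, 64, 64, 3))
z = encoder(images)  # (1, 8, 8, 128): one Downsample per level except the last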
ganime/model/vqgan_clean/diffusion/layers.py ADDED
@@ -0,0 +1,179 @@
1
+ import tensorflow as tf
2
+ from tensorflow import keras
3
+ from tensorflow.keras import layers, Sequential
4
+ from tensorflow_addons.layers import GroupNormalization
5
+
6
+
7
+ @tf.keras.utils.register_keras_serializable()
8
+ class ResnetBlock(layers.Layer):
9
+ def __init__(
10
+ self,
11
+ *,
12
+ in_channels,
13
+ dropout=0.0,
14
+ out_channels=None,
15
+ conv_shortcut=False,
16
+ **kwargs
17
+ ):
18
+ super().__init__(**kwargs)
19
+ self.in_channels = in_channels
20
+ self.dropout_rate = dropout
21
+ out_channels = in_channels if out_channels is None else out_channels
22
+ self.out_channels = out_channels
23
+ self.use_conv_shortcut = conv_shortcut
24
+
25
+ self.norm1 = GroupNormalization(groups=32, epsilon=1e-6)
26
+
27
+ self.conv1 = layers.Conv2D(
28
+ out_channels, kernel_size=3, strides=1, padding="same"
29
+ )
30
+
31
+ self.norm2 = GroupNormalization(groups=32, epsilon=1e-6)
32
+ self.dropout = layers.Dropout(dropout)
33
+
34
+ self.conv2 = layers.Conv2D(
35
+ out_channels, kernel_size=3, strides=1, padding="same"
36
+ )
37
+
38
+ if self.in_channels != self.out_channels:
39
+ if self.use_conv_shortcut:
40
+ self.conv_shortcut = layers.Conv2D(
41
+ out_channels, kernel_size=3, strides=1, padding="same"
42
+ )
43
+ else:
44
+ self.nin_shortcut = layers.Conv2D(
45
+ out_channels, kernel_size=1, strides=1, padding="valid"
46
+ )
47
+
48
+ def get_config(self):
49
+ config = super().get_config()
50
+ config.update(
51
+ {
52
+ "in_channels": self.in_channels,
53
+ "dropout": self.dropout_rate,
54
+ "out_channels": self.out_channels,
55
+ "conv_shortcut": self.use_conv_shortcut,
56
+ }
57
+ )
58
+ return config
59
+
60
+ def call(self, x):
61
+ h = x
62
+ h = self.norm1(h)
63
+ h = keras.activations.swish(h)
64
+ h = self.conv1(h)
65
+
66
+ h = self.norm2(h)
67
+ h = keras.activations.swish(h)
68
+ h = self.dropout(h)
69
+ h = self.conv2(h)
70
+
71
+ if self.in_channels != self.out_channels:
72
+ if self.use_conv_shortcut:
73
+ x = self.conv_shortcut(x)
74
+ else:
75
+ x = self.nin_shortcut(x)
76
+
77
+ return x + h
78
+
79
+
80
+ @tf.keras.utils.register_keras_serializable()
81
+ class AttentionBlock(layers.Layer):
82
+ def __init__(self, channels, **kwargs):
83
+ super().__init__(**kwargs)
84
+ self.channels = channels
85
+ self.norm = GroupNormalization(groups=32, epsilon=1e-6)
86
+ self.q = layers.Conv2D(channels, kernel_size=1, strides=1, padding="valid")
87
+ self.k = layers.Conv2D(channels, kernel_size=1, strides=1, padding="valid")
88
+ self.v = layers.Conv2D(channels, kernel_size=1, strides=1, padding="valid")
89
+ self.proj_out = layers.Conv2D(
90
+ channels, kernel_size=1, strides=1, padding="valid"
91
+ )
92
+
93
+ self.attention = layers.Attention()
94
+
95
+ def get_config(self):
96
+ config = super().get_config()
97
+ config.update(
98
+ {
99
+ "channels": self.channels,
100
+ }
101
+ )
102
+ return config
103
+
104
+ def call(self, x):
105
+ h_ = x
106
+ h_ = self.norm(h_)
107
+ q = self.q(h_)
108
+ k = self.k(h_)
109
+ v = self.v(h_)
110
+
111
+ # compute attention
112
+ (b, h, w, c,) = (
113
+ tf.shape(q)[0],
114
+ tf.shape(q)[1],
115
+ tf.shape(q)[2],
116
+ tf.shape(q)[3],
117
+ )
118
+
119
+ if b is None:
120
+ b = -1
121
+ q = tf.reshape(q, [b, h * w, c])
122
+ k = tf.reshape(k, [b, h * w, c])
123
+ v = tf.reshape(v, [b, h * w, c])
124
+
125
+ h_ = self.attention([q, v, k])
126
+
127
+ h_ = tf.reshape(h_, [b, h, w, c])
128
+
129
+ h_ = self.proj_out(h_)
130
+
131
+ return x + h_
132
+
133
+
134
+ @tf.keras.utils.register_keras_serializable()
135
+ class Downsample(layers.Layer):
136
+ def __init__(self, channels, **kwargs):
137
+ super().__init__(**kwargs)
138
+ self.channels = channels
139
+ self.down_sample = layers.AveragePooling2D(
140
+ pool_size=2, strides=2
141
+ )
142
+ self.conv = layers.Conv2D(channels, kernel_size=3, strides=1, padding="same")
143
+
144
+ def get_config(self):
145
+ config = super().get_config()
146
+ config.update(
147
+ {
148
+ "channels": self.channels,
149
+ }
150
+ )
151
+ return config
152
+
153
+ def call(self, x):
154
+ x = self.down_sample(x)
155
+ x = self.conv(x)
156
+ return x
157
+
158
+
159
+ @tf.keras.utils.register_keras_serializable()
160
+ class Upsample(layers.Layer):
161
+ def __init__(self, channels, **kwargs):
162
+ super().__init__(**kwargs)
163
+ self.channels = channels
164
+ self.up_sample = layers.UpSampling2D(size=2, interpolation="bilinear")
165
+ self.conv = layers.Conv2D(channels, kernel_size=3, strides=1, padding="same")
166
+
167
+ def get_config(self):
168
+ config = super().get_config()
169
+ config.update(
170
+ {
171
+ "channels": self.channels,
172
+ }
173
+ )
174
+ return config
175
+
176
+ def call(self, x):
177
+ x = self.up_sample(x)
178
+ x = self.conv(x)
179
+ return x
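The Downsample and Upsample layers above pair a fixed resize (average pooling or bilinear upsampling) with a learned 3x3 convolution. A quick, illustrative shape check:

import tensorflow as tf
from ganime.model.vqgan_clean.diffusion.layers import Downsample, Upsample  # paths from this commit

x = tf.random.normal((1, 16, 16, 64))
down = Downsample(64)(x)   # average pool /2 then 3x3 conv -> (1, 8, 8, 64)
up = Upsample(64)(down)    # bilinear x2 then 3x3 conv     -> (1, 16, 16, 64)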
ganime/model/vqgan_clean/discriminator/__init__.py ADDED
File without changes
ganime/model/vqgan_clean/discriminator/model.py ADDED
@@ -0,0 +1,88 @@
1
+ from typing import List
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow import keras
5
+ from tensorflow.keras import Model, Sequential
6
+ from tensorflow.keras import layers
7
+ from tensorflow.keras.initializers import RandomNormal
8
+
9
+
10
+ class NLayerDiscriminator(Model):
11
+ """Defines a PatchGAN discriminator as in Pix2Pix
12
+ --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
13
+ """
14
+
15
+ def __init__(self, filters: int = 64, n_layers: int = 3, **kwargs):
16
+ super().__init__(**kwargs)
17
+
18
+ init = RandomNormal(stddev=0.02)
19
+ self.filters = filters
20
+ self.n_layers = n_layers
21
+
22
+ kernel_size = 4
23
+
24
+ inp = tf.keras.layers.Input(shape=[256, 512, 3], name="input_image")
25
+ tar = tf.keras.layers.Input(shape=[256, 512, 3], name="target_image")
26
+
27
+ x = tf.keras.layers.concatenate([inp, tar])
28
+
29
+ x = layers.Conv2D(
30
+ filters,
31
+ kernel_size=kernel_size,
32
+ strides=2,
33
+ # strides=1,
34
+ padding="same",
35
+ kernel_initializer=init,
36
+ )(x)
37
+ x = layers.LeakyReLU(alpha=0.2)(x)
38
+
39
+ filters_mult = 1
40
+ for n in range(1, n_layers):
41
+ filters_mult = min(2**n, 8)
42
+
43
+ x = layers.Conv2D(
44
+ filters * filters_mult,
45
+ kernel_size=kernel_size,
46
+ # strides=1, # 2,
47
+ strides=2,
48
+ padding="same",
49
+ use_bias=False,
50
+ kernel_initializer=init,
51
+ )(x)
52
+ x = layers.BatchNormalization()(x)
53
+ x = layers.LeakyReLU(alpha=0.2)(x)
54
+
55
+ filters_mult = min(2**n_layers, 8)
56
+ x = layers.Conv2D(
57
+ filters * filters_mult,
58
+ kernel_size=kernel_size,
59
+ strides=1,
60
+ padding="same",
61
+ use_bias=False,
62
+ kernel_initializer=init,
63
+ )(x)
64
+ x = layers.BatchNormalization()(x)
65
+ x = layers.LeakyReLU(alpha=0.2)(x)
66
+
67
+ x = layers.Conv2D(
68
+ 1,
69
+ kernel_size=kernel_size,
70
+ strides=1,
71
+ padding="same",
72
+ # activation="sigmoid",
73
+ kernel_initializer=init,
74
+ )(x)
75
+ self.model = tf.keras.Model(inputs=[inp, tar], outputs=x)
76
+
77
+ def call(self, inputs, training=True, mask=None):
78
+ return self.model(inputs)
79
+
80
+ def get_config(self):
81
+ config = super().get_config()
82
+ config.update(
83
+ {
84
+ "filters": self.filters,
85
+ "n_layers": self.n_layers,
86
+ }
87
+ )
88
+ return config
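The PatchGAN discriminator above classifies overlapping patches rather than whole frames: with the hard-coded 256x512 inputs and n_layers=3 (three stride-2 convolutions), it returns a 32x64 grid of raw logits (no sigmoid). A hedged usage sketch:

import tensorflow as tf
from ganime.model.vqgan_clean.discriminator.model import NLayerDiscriminator  # path from this commit

disc = NLayerDiscriminator(filters=64, n_layers=3)
input_image = tf.random.normal((1, 256, 512, 3))   # matches the "input_image" Input above
target_image = tf.random.normal((1, 256, 512, 3))  # matches the "target_image" Input above
patch_logits = disc([input_image, target_image])   # (1, 32, 64, 1) per-patch logits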
ganime/model/vqgan_clean/discriminator/model_bkp.py ADDED
@@ -0,0 +1,76 @@
1
+ from typing import List
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow import keras
5
+ from tensorflow.keras import Model, Sequential
6
+ from tensorflow.keras import layers
7
+
8
+
9
+ class NLayerDiscriminator(Model):
10
+ """Defines a PatchGAN discriminator as in Pix2Pix
11
+ --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
12
+ """
13
+
14
+ def __init__(self, filters: int = 64, n_layers: int = 3, **kwargs):
15
+ super().__init__(**kwargs)
16
+
17
+ self.filters = filters
18
+ self.n_layers = n_layers
19
+
20
+ kernel_size = 4
21
+ self.sequence = [
22
+ layers.Conv2D(filters, kernel_size=kernel_size, strides=1, padding="same"),
23
+ layers.LeakyReLU(alpha=0.2),
24
+ ]
25
+
26
+ filters_mult = 1
27
+ for n in range(1, n_layers):
28
+ filters_mult = min(2**n, 8)
29
+
30
+ self.sequence += [
31
+ layers.AveragePooling2D(pool_size=2),
32
+ layers.Conv2D(
33
+ filters * filters_mult,
34
+ kernel_size=kernel_size,
35
+ strides=1, # 2,
36
+ # strides=2,
37
+ padding="same",
38
+ use_bias=False,
39
+ ),
40
+ layers.BatchNormalization(),
41
+ layers.LeakyReLU(alpha=0.2),
42
+ ]
43
+
44
+ filters_mult = min(2**n_layers, 8)
45
+ self.sequence += [
46
+ layers.AveragePooling2D(pool_size=2),
47
+ layers.Conv2D(
48
+ filters * filters_mult,
49
+ kernel_size=kernel_size,
50
+ strides=1,
51
+ padding="same",
52
+ use_bias=False,
53
+ ),
54
+ layers.BatchNormalization(),
55
+ layers.LeakyReLU(alpha=0.2),
56
+ ]
57
+
58
+ self.sequence += [
59
+ layers.Conv2D(1, kernel_size=kernel_size, strides=1, padding="same")
60
+ ]
61
+
62
+ def call(self, inputs, training=True, mask=None):
63
+ h = inputs
64
+ for seq in self.sequence:
65
+ h = seq(h)
66
+ return h
67
+
68
+ def get_config(self):
69
+ config = super().get_config()
70
+ config.update(
71
+ {
72
+ "filters": self.filters,
73
+ "n_layers": self.n_layers,
74
+ }
75
+ )
76
+ return config
ganime/model/vqgan_clean/experimental/gpt2_embedding.py ADDED
@@ -0,0 +1,1127 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ TF 2.0 OpenAI GPT-2 model."""
17
+
18
+ from dataclasses import dataclass
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import tensorflow as tf
23
+ from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
24
+
25
+
26
+ from transformers.activations_tf import get_tf_activation
27
+ from transformers.modeling_tf_outputs import (
28
+ TFBaseModelOutputWithPastAndCrossAttentions,
29
+ TFCausalLMOutputWithCrossAttentions,
30
+ TFSequenceClassifierOutputWithPast,
31
+ )
32
+ from transformers.modeling_tf_utils import (
33
+ TFCausalLanguageModelingLoss,
34
+ TFConv1D,
35
+ TFModelInputType,
36
+ TFPreTrainedModel,
37
+ TFSequenceClassificationLoss,
38
+ TFSequenceSummary,
39
+ TFSharedEmbeddings,
40
+ get_initializer,
41
+ keras_serializable,
42
+ unpack_inputs,
43
+ )
44
+ from transformers.tf_utils import shape_list, stable_softmax
45
+ from transformers.utils import (
46
+ DUMMY_INPUTS,
47
+ ModelOutput,
48
+ add_code_sample_docstrings,
49
+ add_start_docstrings,
50
+ add_start_docstrings_to_model_forward,
51
+ logging,
52
+ replace_return_docstrings,
53
+ )
54
+ from transformers import GPT2Config
55
+
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "gpt2"
60
+ _CONFIG_FOR_DOC = "GPT2Config"
61
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
62
+
63
+ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
64
+ "gpt2",
65
+ "gpt2-medium",
66
+ "gpt2-large",
67
+ "gpt2-xl",
68
+ "distilgpt2",
69
+ # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
70
+ ]
71
+
72
+
73
+ class TFAttention(tf.keras.layers.Layer):
74
+ def __init__(self, nx, config, scale=False, is_cross_attention=False, **kwargs):
75
+ super().__init__(**kwargs)
76
+
77
+ n_state = nx # in Attention: n_state=768 (nx=n_embd)
78
+ # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
79
+ assert n_state % config.n_head == 0
80
+ self.n_head = config.n_head
81
+ self.split_size = n_state
82
+ self.scale = scale
83
+ self.output_attentions = config.output_attentions
84
+
85
+ self.is_cross_attention = is_cross_attention
86
+
87
+ if self.is_cross_attention:
88
+ self.c_attn = TFConv1D(
89
+ n_state * 2,
90
+ nx,
91
+ initializer_range=config.initializer_range,
92
+ name="c_attn",
93
+ )
94
+ self.q_attn = TFConv1D(
95
+ n_state, nx, initializer_range=config.initializer_range, name="q_attn"
96
+ )
97
+ else:
98
+ self.c_attn = TFConv1D(
99
+ n_state * 3,
100
+ nx,
101
+ initializer_range=config.initializer_range,
102
+ name="c_attn",
103
+ )
104
+
105
+ self.c_proj = TFConv1D(
106
+ n_state, nx, initializer_range=config.initializer_range, name="c_proj"
107
+ )
108
+ self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
109
+ self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
110
+ self.pruned_heads = set()
111
+
112
+ def prune_heads(self, heads):
113
+ pass
114
+
115
+ @staticmethod
116
+ def causal_attention_mask(nd, ns, dtype):
117
+ """
118
+ 1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]),
119
+ -1, ns-nd), but doesn't produce garbage on TPUs.
120
+ """
121
+ i = tf.range(nd)[:, None]
122
+ j = tf.range(ns)
123
+ m = i >= j - ns + nd
124
+ return tf.cast(m, dtype)
125
+
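To make the docstring above concrete: for nd=3 query positions attending over ns=5 total positions, row i can see keys up to index i + (ns - nd). Illustrative only:

import tensorflow as tf

# Static method, so it can be called without instantiating TFAttention.
mask = TFAttention.causal_attention_mask(3, 5, tf.float32)
# [[1. 1. 1. 0. 0.]
#  [1. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 1.]]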
126
+ def _attn(
127
+ self, q, k, v, attention_mask, head_mask, output_attentions, training=False
128
+ ):
129
+ # q, k, v have shape [batch, heads, sequence, features]
130
+ w = tf.matmul(q, k, transpose_b=True)
131
+ if self.scale:
132
+ dk = tf.cast(shape_list(k)[-1], dtype=w.dtype) # scale attention_scores
133
+ w = w / tf.math.sqrt(dk)
134
+
135
+ if not self.is_cross_attention:
136
+ # if only "normal" attention layer implements causal mask
137
+
138
+ # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
139
+ _, _, nd, ns = shape_list(w)
140
+ b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
141
+ b = tf.reshape(b, [1, 1, nd, ns])
142
+ w = w * b - 1e4 * (1 - b)
143
+
144
+ if attention_mask is not None:
145
+ # Apply the attention mask
146
+ attention_mask = tf.cast(attention_mask, dtype=w.dtype)
147
+ w = w + attention_mask
148
+
149
+ w = stable_softmax(w, axis=-1)
150
+ w = self.attn_dropout(w, training=training)
151
+
152
+ # Mask heads if we want to
153
+ if head_mask is not None:
154
+ w = w * head_mask
155
+
156
+ outputs = [tf.matmul(w, v)]
157
+ if output_attentions:
158
+ outputs.append(w)
159
+ return outputs
160
+
161
+ def merge_heads(self, x):
162
+ x = tf.transpose(x, [0, 2, 1, 3])
163
+ x_shape = shape_list(x)
164
+ new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
165
+ return tf.reshape(x, new_x_shape)
166
+
167
+ def split_heads(self, x):
168
+ x_shape = shape_list(x)
169
+ new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
170
+ x = tf.reshape(x, new_x_shape)
171
+ return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
172
+
173
+ def call(
174
+ self,
175
+ x,
176
+ layer_past,
177
+ attention_mask,
178
+ head_mask,
179
+ encoder_hidden_states,
180
+ encoder_attention_mask,
181
+ use_cache,
182
+ output_attentions,
183
+ training=False,
184
+ ):
185
+
186
+ if encoder_hidden_states is not None:
187
+ if not hasattr(self, "q_attn"):
188
+ raise ValueError(
189
+ "If class is used as cross attention, the weights `q_attn` have to be defined. "
190
+ "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
191
+ )
192
+
193
+ query = self.q_attn(x)
194
+ kv_out = self.c_attn(encoder_hidden_states)
195
+ key, value = tf.split(kv_out, 2, axis=2)
196
+ attention_mask = encoder_attention_mask
197
+ else:
198
+ x = self.c_attn(x)
199
+ query, key, value = tf.split(x, 3, axis=2)
200
+
201
+ query = self.split_heads(query)
202
+ key = self.split_heads(key)
203
+ value = self.split_heads(value)
204
+ if layer_past is not None:
205
+ past_key, past_value = tf.unstack(layer_past, axis=0)
206
+ key = tf.concat([past_key, key], axis=-2)
207
+ value = tf.concat([past_value, value], axis=-2)
208
+
209
+ # to cope with keras serialization
210
+ if use_cache:
211
+ present = tf.stack([key, value], axis=0)
212
+ else:
213
+ present = (None,)
214
+
215
+ attn_outputs = self._attn(
216
+ query,
217
+ key,
218
+ value,
219
+ attention_mask,
220
+ head_mask,
221
+ output_attentions,
222
+ training=training,
223
+ )
224
+ a = attn_outputs[0]
225
+
226
+ a = self.merge_heads(a)
227
+ a = self.c_proj(a)
228
+ a = self.resid_dropout(a, training=training)
229
+
230
+ outputs = [a, present] + attn_outputs[1:]
231
+ return outputs # a, present, (attentions)
232
+
233
+
234
+ class TFMLP(tf.keras.layers.Layer):
235
+ def __init__(self, n_state, config, **kwargs):
236
+ super().__init__(**kwargs)
237
+ nx = config.n_embd
238
+ self.c_fc = TFConv1D(
239
+ n_state, nx, initializer_range=config.initializer_range, name="c_fc"
240
+ )
241
+ self.c_proj = TFConv1D(
242
+ nx, n_state, initializer_range=config.initializer_range, name="c_proj"
243
+ )
244
+ self.act = get_tf_activation(config.activation_function)
245
+ self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
246
+
247
+ def call(self, x, training=False):
248
+ h = self.act(self.c_fc(x))
249
+ h2 = self.c_proj(h)
250
+ h2 = self.dropout(h2, training=training)
251
+ return h2
252
+
253
+
254
+ class TFBlock(tf.keras.layers.Layer):
255
+ def __init__(self, config, scale=False, **kwargs):
256
+ super().__init__(**kwargs)
257
+ nx = config.n_embd
258
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * nx
259
+ self.ln_1 = tf.keras.layers.LayerNormalization(
260
+ epsilon=config.layer_norm_epsilon, name="ln_1"
261
+ )
262
+ self.attn = TFAttention(nx, config, scale, name="attn")
263
+ self.ln_2 = tf.keras.layers.LayerNormalization(
264
+ epsilon=config.layer_norm_epsilon, name="ln_2"
265
+ )
266
+
267
+ if config.add_cross_attention:
268
+
269
+ self.crossattention = TFAttention(
270
+ nx, config, scale, name="crossattention", is_cross_attention=True
271
+ )
272
+ self.ln_cross_attn = tf.keras.layers.LayerNormalization(
273
+ epsilon=config.layer_norm_epsilon, name="ln_cross_attn"
274
+ )
275
+
276
+ self.mlp = TFMLP(inner_dim, config, name="mlp")
277
+
278
+ def call(
279
+ self,
280
+ x,
281
+ layer_past,
282
+ attention_mask,
283
+ head_mask,
284
+ encoder_hidden_states,
285
+ encoder_attention_mask,
286
+ use_cache,
287
+ output_attentions,
288
+ training=False,
289
+ ):
290
+ a = self.ln_1(x)
291
+ output_attn = self.attn(
292
+ a,
293
+ layer_past=layer_past,
294
+ attention_mask=attention_mask,
295
+ head_mask=head_mask,
296
+ encoder_hidden_states=None,
297
+ encoder_attention_mask=None,
298
+ use_cache=use_cache,
299
+ output_attentions=output_attentions,
300
+ training=training,
301
+ )
302
+ a = output_attn[0] # output_attn: a, present, (attentions)
303
+ outputs = output_attn[1:]
304
+ x = x + a
305
+
306
+ # Cross-Attention Block
307
+ if encoder_hidden_states is not None:
308
+ # add one self-attention block for cross-attention
309
+ if not hasattr(self, "crossattention"):
310
+ raise ValueError(
311
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
312
+ "cross-attention layers by setting `config.add_cross_attention=True`"
313
+ )
314
+
315
+ ca = self.ln_cross_attn(x)
316
+ output_cross_attn = self.crossattention(
317
+ ca,
318
+ layer_past=None,
319
+ attention_mask=attention_mask,
320
+ head_mask=head_mask,
321
+ encoder_hidden_states=encoder_hidden_states,
322
+ encoder_attention_mask=encoder_attention_mask,
323
+ use_cache=False,
324
+ output_attentions=output_attentions,
325
+ training=training,
326
+ )
327
+ ca = output_cross_attn[0] # output_attn: a, present, (cross_attentions)
328
+ x = x + ca
329
+ outputs = (
330
+ outputs + output_cross_attn[2:]
331
+ ) # add cross attentions if we output attention weights
332
+
333
+ m = self.ln_2(x)
334
+ m = self.mlp(m, training=training)
335
+ x = x + m
336
+
337
+ outputs = [x] + outputs
338
+ return outputs # x, present, (attentions, cross_attentions)
339
+
340
+
341
+ @keras_serializable
342
+ class TFGPT2MainLayer(tf.keras.layers.Layer):
343
+ config_class = GPT2Config
344
+
345
+ def __init__(self, config, *inputs, **kwargs):
346
+ super().__init__(*inputs, **kwargs)
347
+
348
+ self.config = config
349
+ self.output_attentions = config.output_attentions
350
+ self.output_hidden_states = config.output_hidden_states
351
+ self.use_cache = config.use_cache
352
+ self.return_dict = config.use_return_dict
353
+
354
+ self.num_hidden_layers = config.n_layer
355
+ self.vocab_size = config.vocab_size
356
+ self.n_embd = config.n_embd
357
+ self.n_positions = config.n_positions
358
+ self.initializer_range = config.initializer_range
359
+
360
+ self.wte = TFSharedEmbeddings(
361
+ config.vocab_size,
362
+ config.hidden_size,
363
+ initializer_range=config.initializer_range,
364
+ name="wte",
365
+ )
366
+
367
+ self.wte_remaining_frames = TFSharedEmbeddings(
368
+ config.vocab_size,
369
+ config.hidden_size,
370
+ initializer_range=config.initializer_range,
371
+ name="wte_remaining_frames",
372
+ )
373
+ self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
374
+ self.h = [
375
+ TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)
376
+ ]
377
+ self.ln_f = tf.keras.layers.LayerNormalization(
378
+ epsilon=config.layer_norm_epsilon, name="ln_f"
379
+ )
380
+
381
+ def build(self, input_shape):
382
+ with tf.name_scope("wpe"):
383
+ self.wpe = self.add_weight(
384
+ name="embeddings",
385
+ shape=[self.n_positions, self.n_embd],
386
+ initializer=get_initializer(self.initializer_range),
387
+ )
388
+ self.wte_remaining_frames.build(input_shape)
389
+
390
+ super().build(input_shape)
391
+
392
+ def get_input_embeddings(self):
393
+ return self.wte
394
+
395
+ def get_remaining_frames_embeddings(self):
396
+ return self.wte_remaining_frames
397
+
398
+ def set_input_embeddings(self, value):
399
+ self.wte.weight = value
400
+ self.wte.vocab_size = shape_list(value)[0]
401
+
402
+ def set_remaining_frames_embeddings(self, value):
403
+ self.wte_remaining_frames.weight = value
404
+ self.wte_remaining_frames.vocab_size = shape_list(value)[0]
405
+
406
+ def _prune_heads(self, heads_to_prune):
407
+ """
408
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
409
+ """
410
+ raise NotImplementedError
411
+
412
+ @unpack_inputs
413
+ def call(
414
+ self,
415
+ input_ids: Optional[TFModelInputType] = None,
416
+ remaining_frames_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
417
+ past: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
418
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
419
+ token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
420
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
421
+ head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
422
+ inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
423
+ encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
424
+ encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
425
+ use_cache: Optional[bool] = None,
426
+ output_attentions: Optional[bool] = None,
427
+ output_hidden_states: Optional[bool] = None,
428
+ return_dict: Optional[bool] = None,
429
+ training: Optional[bool] = False,
430
+ ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
431
+
432
+ if input_ids is not None and inputs_embeds is not None:
433
+ raise ValueError(
434
+ "You cannot specify both input_ids and inputs_embeds at the same time"
435
+ )
436
+ elif input_ids is not None:
437
+ input_shape = shape_list(input_ids)
438
+ input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
439
+ elif inputs_embeds is not None:
440
+ input_shape = shape_list(inputs_embeds)[:-1]
441
+ else:
442
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
443
+
444
+ if past is None:
445
+ past_length = 0
446
+ past = [None] * len(self.h)
447
+ else:
448
+ past_length = shape_list(past[0][0])[-2]
449
+
450
+ if position_ids is None:
451
+ position_ids = tf.expand_dims(
452
+ tf.range(past_length, input_shape[-1] + past_length), axis=0
453
+ )
454
+
455
+ if attention_mask is not None:
456
+ # We create a 3D attention mask from a 2D tensor mask.
457
+ # Sizes are [batch_size, 1, 1, to_seq_length]
458
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
459
+ # this attention mask is more simple than the triangular masking of causal attention
460
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
461
+ attention_mask_shape = shape_list(attention_mask)
462
+ attention_mask = tf.reshape(
463
+ attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
464
+ )
465
+
466
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
467
+ # masked positions, this operation will create a tensor which is 0.0 for
468
+ # positions we want to attend and -10000.0 for masked positions.
469
+ # Since we are adding it to the raw scores before the softmax, this is
470
+ # effectively the same as removing these entirely.
471
+ one_cst = tf.constant(1.0)
472
+ attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype)
473
+ attention_mask = tf.multiply(
474
+ tf.subtract(one_cst, attention_mask), tf.constant(-10000.0)
475
+ )
476
+
477
+ # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
478
+ if self.config.add_cross_attention and encoder_attention_mask is not None:
479
+ # If a 2D or 3D attention mask is provided for the cross-attention
480
+ # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
481
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
482
+ encoder_attention_mask = tf.cast(
483
+ encoder_attention_mask, dtype=encoder_hidden_states.dtype
484
+ )
485
+ num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
486
+ if num_dims_encoder_attention_mask == 3:
487
+ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
488
+ if num_dims_encoder_attention_mask == 2:
489
+ encoder_extended_attention_mask = encoder_attention_mask[
490
+ :, None, None, :
491
+ ]
492
+
493
+ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
494
+ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
495
+ # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
496
+ # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
497
+
498
+ encoder_extended_attention_mask = (
499
+ 1.0 - encoder_extended_attention_mask
500
+ ) * -10000.0
501
+ else:
502
+ encoder_extended_attention_mask = None
503
+
504
+ encoder_attention_mask = encoder_extended_attention_mask
505
+
506
+ # Prepare head mask if needed
507
+ # 1.0 in head_mask indicate we keep the head
508
+ # attention_probs has shape bsz x n_heads x N x N
509
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
510
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
511
+ if head_mask is not None:
512
+ raise NotImplementedError
513
+ else:
514
+ head_mask = [None] * self.num_hidden_layers
515
+ # head_mask = tf.constant([0] * self.num_hidden_layers)
516
+
517
+ position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
518
+
519
+ if inputs_embeds is None:
520
+ inputs_embeds = self.wte(input_ids, mode="embedding")
521
+
522
+ position_embeds = tf.gather(self.wpe, position_ids)
523
+
524
+ if token_type_ids is not None:
525
+ token_type_ids = tf.reshape(
526
+ token_type_ids, [-1, shape_list(token_type_ids)[-1]]
527
+ )
528
+ token_type_embeds = self.wte(token_type_ids, mode="embedding")
529
+ else:
530
+ token_type_embeds = tf.constant(0.0)
531
+
532
+ if remaining_frames_ids is not None:
533
+ remaining_frames_ids = tf.reshape(
534
+ remaining_frames_ids, [-1, shape_list(remaining_frames_ids)[-1]]
535
+ )
536
+ remaining_frames_embeds = self.wte_remaining_frames(
537
+ remaining_frames_ids, mode="embedding"
538
+ )
539
+ else:
540
+ remaining_frames_embeds = tf.constant(0.0)
541
+
542
+ position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype)
543
+ token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype)
544
+ remaining_frames_embeds = tf.cast(
545
+ remaining_frames_embeds, dtype=inputs_embeds.dtype
546
+ )
547
+ hidden_states = (
548
+ inputs_embeds
549
+ + position_embeds
550
+ + token_type_embeds
551
+ + remaining_frames_embeds
552
+ )
553
+ hidden_states = self.drop(hidden_states, training=training)
554
+
555
+ output_shape = input_shape + [shape_list(hidden_states)[-1]]
556
+
557
+ presents = () if use_cache else None
558
+ all_attentions = () if output_attentions else None
559
+ all_cross_attentions = (
560
+ () if output_attentions and self.config.add_cross_attention else None
561
+ )
562
+ all_hidden_states = () if output_hidden_states else None
563
+ for i, (block, layer_past) in enumerate(zip(self.h, past)):
564
+ if output_hidden_states:
565
+ all_hidden_states = all_hidden_states + (
566
+ tf.reshape(hidden_states, output_shape),
567
+ )
568
+
569
+ outputs = block(
570
+ hidden_states,
571
+ layer_past,
572
+ attention_mask,
573
+ head_mask[i],
574
+ encoder_hidden_states,
575
+ encoder_attention_mask,
576
+ use_cache,
577
+ output_attentions,
578
+ training=training,
579
+ )
580
+
581
+ hidden_states, present = outputs[:2]
582
+ if use_cache:
583
+ presents = presents + (present,)
584
+
585
+ if output_attentions:
586
+ all_attentions = all_attentions + (outputs[2],)
587
+ if (
588
+ self.config.add_cross_attention
589
+ and encoder_hidden_states is not None
590
+ ):
591
+ all_cross_attentions = all_cross_attentions + (outputs[3],)
592
+
593
+ hidden_states = self.ln_f(hidden_states)
594
+
595
+ hidden_states = tf.reshape(hidden_states, output_shape)
596
+ # Add last hidden state
597
+ if output_hidden_states:
598
+ all_hidden_states = all_hidden_states + (hidden_states,)
599
+
600
+ if output_attentions:
601
+ # let the number of heads free (-1) so we can extract attention even after head pruning
602
+ attention_output_shape = (
603
+ input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
604
+ )
605
+ all_attentions = tuple(
606
+ tf.reshape(t, attention_output_shape) for t in all_attentions
607
+ )
608
+
609
+ if not return_dict:
610
+ return tuple(
611
+ v
612
+ for v in [
613
+ hidden_states,
614
+ presents,
615
+ all_hidden_states,
616
+ all_attentions,
617
+ all_cross_attentions,
618
+ ]
619
+ if v is not None
620
+ )
621
+
622
+ return TFBaseModelOutputWithPastAndCrossAttentions(
623
+ last_hidden_state=hidden_states,
624
+ past_key_values=presents,
625
+ hidden_states=all_hidden_states,
626
+ attentions=all_attentions,
627
+ cross_attentions=all_cross_attentions,
628
+ )
629
+
630
+
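The main change versus stock TFGPT2MainLayer is the second embedding table, wte_remaining_frames: the hidden state entering the first block is the sum of token, position, optional token-type and remaining-frames embeddings. A simplified, self-contained illustration of that sum (plain Keras Embedding layers stand in for TFSharedEmbeddings and wpe; the sizes are made up):

import tensorflow as tf

vocab_size, n_positions, n_embd = 1024, 64, 16  # made-up sizes
wte = tf.keras.layers.Embedding(vocab_size, n_embd)                   # token codes
wte_remaining_frames = tf.keras.layers.Embedding(vocab_size, n_embd)  # frames left to generate
wpe = tf.keras.layers.Embedding(n_positions, n_embd)                  # positions

input_ids = tf.constant([[5, 9, 2]])
remaining_frames_ids = tf.constant([[12, 12, 12]])  # same count repeated at every position
position_ids = tf.range(3)[tf.newaxis, :]

hidden_states = (
    wte(input_ids) + wpe(position_ids) + wte_remaining_frames(remaining_frames_ids)
)  # (1, 3, 16); dropout and the TFBlock stack then run on this sum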
631
+ class TFGPT2PreTrainedModel(TFPreTrainedModel):
632
+ """
633
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
634
+ models.
635
+ """
636
+
637
+ config_class = GPT2Config
638
+ base_model_prefix = "transformer"
639
+ # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
640
+ _keys_to_ignore_on_load_unexpected = [
641
+ r"h.\d+.attn.bias",
642
+ r"h.\d+.crossattention.bias",
643
+ ]
644
+
645
+ @property
646
+ def dummy_inputs(self):
647
+ """
648
+ Dummy inputs to build the network.
649
+
650
+ Returns:
651
+ `Dict[str, tf.Tensor]`: The dummy inputs.
652
+ """
653
+ dummy = {"input_ids": tf.constant(DUMMY_INPUTS)}
654
+ # Add `encoder_hidden_states` to make the cross-attention layers' weights initialized
655
+ if self.config.add_cross_attention:
656
+ batch_size, seq_len = tf.constant(DUMMY_INPUTS).shape
657
+ shape = (batch_size, seq_len) + (self.config.hidden_size,)
658
+ h = tf.random.uniform(shape=shape)
659
+ dummy["encoder_hidden_states"] = h
660
+
661
+ return dummy
662
+
663
+ @tf.function(
664
+ input_signature=[
665
+ {
666
+ "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
667
+ "attention_mask": tf.TensorSpec(
668
+ (None, None), tf.int32, name="attention_mask"
669
+ ),
670
+ }
671
+ ]
672
+ )
673
+ def serving(self, inputs):
674
+ output = self.call(inputs)
675
+
676
+ return self.serving_output(output)
677
+
678
+
679
+ GPT2_START_DOCSTRING = r"""
680
+
681
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
682
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
683
+ etc.)
684
+
685
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
686
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
687
+ behavior.
688
+
689
+ <Tip>
690
+
691
+ TF 2.0 models accept two formats as inputs:
692
+
693
+ - having all inputs as keyword arguments (like PyTorch models), or
694
+ - having all inputs as a list, tuple or dict in the first positional arguments.
695
+
696
+ This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the
697
+ tensors in the first argument of the model call function: `model(inputs)`.
698
+
699
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the
700
+ first positional argument :
701
+
702
+ - a single Tensor with `input_ids` only and nothing else: `model(inputs_ids)`
703
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
704
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
705
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
706
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
707
+
708
+ </Tip>
709
+
710
+ Parameters:
711
+ config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
712
+ Initializing with a config file does not load the weights associated with the model, only the
713
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
714
+ """
715
+
716
+ GPT2_INPUTS_DOCSTRING = r"""
717
+ Args:
718
+ input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
719
+ `input_ids_length` = `sequence_length` if `past` is `None` else `past[0].shape[-2]` (`sequence_length` of
720
+ input past key value states). Indices of input sequence tokens in the vocabulary.
721
+
722
+ If `past` is used, only input IDs that do not have their past calculated should be passed as `input_ids`.
723
+
724
+ Indices can be obtained using [`GPT2Tokenizer`]. See [`PreTrainedTokenizer.__call__`] and
725
+ [`PreTrainedTokenizer.encode`] for details.
726
+
727
+ [What are input IDs?](../glossary#input-ids)
728
+ past (`List[tf.Tensor]` of length `config.n_layers`):
729
+ Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
730
+ `past` output below). Can be used to speed up sequential decoding. The token ids which have their past
731
+ given to this model should not be passed as input ids as they have already been computed.
732
+ attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
733
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
734
+
735
+ - 1 for tokens that are **not masked**,
736
+ - 0 for tokens that are **masked**.
737
+
738
+ If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
739
+ `past_key_values`. In other words, the `attention_mask` always has to have the length:
740
+ `len(past_key_values) + len(input_ids)`
741
+
742
+ [What are attention masks?](../glossary#attention-mask)
743
+ token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
744
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
745
+ 1]`:
746
+
747
+ - 0 corresponds to a *sentence A* token,
748
+ - 1 corresponds to a *sentence B* token.
749
+
750
+ [What are token type IDs?](../glossary#token-type-ids)
751
+ position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
752
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
753
+ config.max_position_embeddings - 1]`.
754
+
755
+ [What are position IDs?](../glossary#position-ids)
756
+ head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
757
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
758
+
759
+ - 1 indicates the head is **not masked**,
760
+ - 0 indicates the head is **masked**.
761
+
762
+ inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
763
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
764
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
765
+ model's internal embedding lookup matrix.
766
+ output_attentions (`bool`, *optional*):
767
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
768
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
769
+ config will be used instead.
770
+ output_hidden_states (`bool`, *optional*):
771
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
772
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
773
+ used instead.
774
+ return_dict (`bool`, *optional*):
775
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
776
+ eager mode, in graph mode the value will always be set to True.
777
+ training (`bool`, *optional*, defaults to `False`):
778
+ Whether or not to use the model in training mode (some modules like dropout modules have different
779
+ behaviors between training and evaluation).
780
+ """
781
+
782
+
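A hedged sketch of the cached-decoding contract the docstring above describes: pass the full prompt once with use_cache=True, then feed only the newest token along with the returned past. The tiny config values are made up, and TFGPT2Model is the class defined just below.

import tensorflow as tf
from transformers import GPT2Config

config = GPT2Config(vocab_size=1024, n_positions=128, n_embd=64, n_layer=2, n_head=2)
model = TFGPT2Model(config)

prompt = tf.constant([[11, 22, 33]])
out = model(input_ids=prompt, use_cache=True)

# Only the token without cached state is passed; `past` carries the key/value cache.
next_out = model(
    input_ids=tf.constant([[44]]),
    past=out.past_key_values,
    use_cache=True,
)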
783
+ @add_start_docstrings(
784
+ "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
785
+ GPT2_START_DOCSTRING,
786
+ )
787
+ class TFGPT2Model(TFGPT2PreTrainedModel):
788
+ def __init__(self, config, *inputs, **kwargs):
789
+ super().__init__(config, *inputs, **kwargs)
790
+ self.transformer = TFGPT2MainLayer(config, name="transformer")
791
+
792
+ @unpack_inputs
793
+ @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
794
+ @add_code_sample_docstrings(
795
+ processor_class=_TOKENIZER_FOR_DOC,
796
+ checkpoint=_CHECKPOINT_FOR_DOC,
797
+ output_type=TFBaseModelOutputWithPastAndCrossAttentions,
798
+ config_class=_CONFIG_FOR_DOC,
799
+ )
800
+ def call(
801
+ self,
802
+ input_ids: Optional[TFModelInputType] = None,
803
+ remaining_frames_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
804
+ past: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
805
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
806
+ token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
807
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
808
+ head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
809
+ inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
810
+ encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
811
+ encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
812
+ use_cache: Optional[bool] = None,
813
+ output_attentions: Optional[bool] = None,
814
+ output_hidden_states: Optional[bool] = None,
815
+ return_dict: Optional[bool] = None,
816
+ training: Optional[bool] = False,
817
+ ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
818
+ r"""
819
+ encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
820
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
821
+ the model is configured as a decoder.
822
+ encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
823
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
824
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
825
+
826
+ - 1 for tokens that are **not masked**,
827
+ - 0 for tokens that are **masked**.
828
+
829
+ past (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
830
+ contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
831
+ If `past` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have
832
+ their past key value states given to this model) of shape `(batch_size, 1)` instead of all
833
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
834
+ use_cache (`bool`, *optional*, defaults to `True`):
835
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
836
+ `past`). Set to `False` during training, `True` during generation
837
+ """
838
+
839
+ outputs = self.transformer(
840
+ input_ids=input_ids,
841
+ remaining_frames_ids=remaining_frames_ids,
842
+ past=past,
843
+ attention_mask=attention_mask,
844
+ token_type_ids=token_type_ids,
845
+ position_ids=position_ids,
846
+ head_mask=head_mask,
847
+ inputs_embeds=inputs_embeds,
848
+ encoder_hidden_states=encoder_hidden_states,
849
+ encoder_attention_mask=encoder_attention_mask,
850
+ use_cache=use_cache,
851
+ output_attentions=output_attentions,
852
+ output_hidden_states=output_hidden_states,
853
+ return_dict=return_dict,
854
+ training=training,
855
+ )
856
+
857
+ return outputs
858
+
859
+ def serving_output(self, output):
860
+ pkv = (
861
+ tf.convert_to_tensor(output.past_key_values)
862
+ if self.config.use_cache
863
+ else None
864
+ )
865
+ hs = (
866
+ tf.convert_to_tensor(output.hidden_states)
867
+ if self.config.output_hidden_states
868
+ else None
869
+ )
870
+ attns = (
871
+ tf.convert_to_tensor(output.attentions)
872
+ if self.config.output_attentions
873
+ else None
874
+ )
875
+ cross_attns = (
876
+ tf.convert_to_tensor(output.cross_attentions)
877
+ if self.config.output_attentions
878
+ and self.config.add_cross_attention
879
+ and output.cross_attentions is not None
880
+ else None
881
+ )
882
+
883
+ return TFBaseModelOutputWithPastAndCrossAttentions(
884
+ last_hidden_state=output.last_hidden_state,
885
+ past_key_values=pkv,
886
+ hidden_states=hs,
887
+ attentions=attns,
888
+ cross_attentions=cross_attns,
889
+ )
890
+
891
+
892
+ @add_start_docstrings(
+     """
+     The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
+     embeddings).
+     """,
+     GPT2_START_DOCSTRING,
+ )
+ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
+     def __init__(self, config, *inputs, **kwargs):
+         super().__init__(config, *inputs, **kwargs)
+         self.transformer = TFGPT2MainLayer(config, name="transformer")
+
+     def get_output_embeddings(self):
+         return self.get_input_embeddings()
+
+     def set_output_embeddings(self, value):
+         self.set_input_embeddings(value)
+
+     def prepare_inputs_for_generation(
+         self, inputs, past=None, use_cache=None, use_xla=False, **kwargs
+     ):
+         # TODO: (Joao) after the TF generator is complete, update GPT2 TF generation to match PT's. NB -- some GPT2
+         # tests will need to be fixed after the change
+
+         # only last token for input_ids if past is defined in kwargs
+         if past:
+             inputs = tf.expand_dims(inputs[:, -1], -1)
+
+         # TODO(pvp, Joao) - this `if use_xla` statement can be removed, but is left
+         # for a future PR to not change too many things for now.
+         # All statements in this if case apply for both xla and non-xla (as they already do in PyTorch)
+         position_ids = None
+         attention_mask = None
+         if use_xla:
+             attention_mask = kwargs.get("attention_mask", None)
+             if past is not None and attention_mask is not None:
+                 position_ids = tf.reduce_sum(attention_mask, axis=1, keepdims=True) - 1
+             elif attention_mask is not None:
+                 position_ids = tf.math.cumsum(attention_mask, axis=1, exclusive=True)
+
+         return {
+             "input_ids": inputs,
+             "attention_mask": attention_mask,
+             "position_ids": position_ids,
+             "past": past,
+             "use_cache": use_cache,
+         }
+
+     def _update_model_kwargs_for_xla_generation(
+         self, outputs, model_kwargs, current_pos, max_length
+     ):
+         # TODO(Pvp, Joao, Matt) - this function can be cleaned a bit and refactored
+         # quite some duplicated code patterns it seems
+         # also the `attention_mask` is currently used in a somewhat hacky way to
+         # correctly influence the `past_key_values` - not sure if this is the way to go
+         # Let's keep that for a future PR.
+         past = outputs.past_key_values
+         is_past_initialized = model_kwargs.pop("past", None) is not None
+         attention_mask = model_kwargs.pop("attention_mask")
+         batch_size = attention_mask.shape[0]
+
+         if not is_past_initialized:
+             # past[0].shape[3] is seq_length of prompt
+             num_padding_values = max_length - past[0].shape[3] - 1
+
+             padding_values = np.zeros((5, 2), dtype=np.int32)
+             padding_values[3, 1] = num_padding_values
+             padding_values = tf.constant(padding_values)
+
+             new_past = list(past)
+             for i in range(len(past)):
+                 new_past[i] = tf.pad(past[i], padding_values)
+
+             # Zeros for the currently-unfilled locations in the past tensor, ones for the actual input_ids
+             attention_mask = tf.concat(
+                 [
+                     attention_mask,
+                     tf.zeros(
+                         (batch_size, num_padding_values), dtype=attention_mask.dtype
+                     ),
+                     tf.ones((batch_size, 1), dtype=attention_mask.dtype),
+                 ],
+                 axis=1,
+             )
+         else:
+             new_past = [None for _ in range(len(past))]
+             slice_start_base = tf.constant([0, 0, 0, 1, 0])
+             attention_mask_update_slice = tf.ones(
+                 (batch_size, 1), dtype=attention_mask.dtype
+             )
+             # correct 5 here
+             new_past_index = current_pos - 1
+
+             for i in range(len(past)):
+                 update_slice = past[i][:, :, :, -1:]
+                 # Write the last slice to the first open location in the padded past array
+                 # and then truncate the last slice off the array
+                 new_past[i] = dynamic_update_slice(
+                     past[i][:, :, :, :-1],
+                     update_slice,
+                     slice_start_base * new_past_index,
+                 )
+
+             update_start = tf.constant([0, 1], dtype=tf.int32) * new_past_index
+             attention_mask = dynamic_update_slice(
+                 attention_mask, attention_mask_update_slice, update_start
+             )
+
+         # set `attention_mask` and `past`
+         model_kwargs["attention_mask"] = attention_mask
+         model_kwargs["past"] = tuple(new_past)
+
+         return model_kwargs
+
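The `_update_model_kwargs_for_xla_generation` method above avoids XLA re-tracing by giving the cache a fixed shape up front (padding the sequence axis of each `past` tensor out to `max_length`) and then writing each new key/value slice in place with `dynamic_update_slice` instead of concatenating. The standalone sketch below reproduces just those two operations on a dummy single-layer cache; it is not part of the uploaded file, and the toy shapes and the `dynamic_update_slice` import path are assumptions taken from the upstream transformers code rather than anything defined in this repository.

```python
# Illustrative sketch only -- not part of gpt2_embedding.py.
import numpy as np
import tensorflow as tf
from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice  # assumed import, as in upstream transformers

batch, heads, head_dim, max_length, prompt_len = 1, 2, 4, 8, 3
# One layer's cache: (2, batch, heads, seq, head_dim) -- keys and values stacked on axis 0.
past_layer = tf.random.normal((2, batch, heads, prompt_len, head_dim))

# Step 1 (first call): pad the sequence axis (axis 3) so the cache already has its
# final, static length; XLA needs shapes that do not change between decoding steps.
padding_values = np.zeros((5, 2), dtype=np.int32)
padding_values[3, 1] = max_length - prompt_len - 1
padded_past = tf.pad(past_layer, tf.constant(padding_values))  # seq axis is now max_length - 1

# Step 2 (every later call): overwrite the next free position with the newest
# key/value slice instead of concatenating, keeping the shape constant.
new_slice = tf.random.normal((2, batch, heads, 1, head_dim))
current_pos = prompt_len + 1
start_indices = tf.constant([0, 0, 0, 1, 0]) * (current_pos - 1)
padded_past = dynamic_update_slice(padded_past, new_slice, start_indices)
print(padded_past.shape)  # (2, 1, 2, 7, 4) -- unchanged across steps
```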
+     @unpack_inputs
+     @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+     @add_code_sample_docstrings(
+         processor_class=_TOKENIZER_FOR_DOC,
+         checkpoint=_CHECKPOINT_FOR_DOC,
+         output_type=TFCausalLMOutputWithCrossAttentions,
+         config_class=_CONFIG_FOR_DOC,
+     )
+     def call(
+         self,
+         input_ids: Optional[TFModelInputType] = None,
+         remaining_frames_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         past: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
+         attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
+         training: Optional[bool] = False,
+     ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]:
+         r"""
+         encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+             the model is configured as a decoder.
+         encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+             the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+
+         past (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
+             contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+             If `past` is used, the user can optionally input only the last `decoder_input_ids` (those that don't have
+             their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+             `past`). Set to `False` during training and `True` during generation.
+         labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+             config.vocab_size - 1]`.
+         """
+
+         transformer_outputs = self.transformer(
+             input_ids=input_ids,
+             remaining_frames_ids=remaining_frames_ids,
+             past=past,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             encoder_hidden_states=encoder_hidden_states,
+             encoder_attention_mask=encoder_attention_mask,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             training=training,
+         )
+         hidden_states = transformer_outputs[0]
+         logits = self.transformer.wte(hidden_states, mode="linear")
+
+         loss = None
+         if labels is not None:
+             # shift labels to the left and cut last logit token
+             shifted_logits = logits[:, :-1]
+             labels = labels[:, 1:]
+             loss = self.hf_compute_loss(labels, shifted_logits)
+
+         if not return_dict:
+             output = (logits,) + transformer_outputs[1:]
+             return ((loss,) + output) if loss is not None else output
+
+         return TFCausalLMOutputWithCrossAttentions(
+             loss=loss,
+             logits=logits,
+             past_key_values=transformer_outputs.past_key_values,
+             hidden_states=transformer_outputs.hidden_states,
+             attentions=transformer_outputs.attentions,
+             cross_attentions=transformer_outputs.cross_attentions,
+         )
+
+     def serving_output(self, output):
+         pkv = (
+             tf.convert_to_tensor(output.past_key_values)
+             if self.config.use_cache
+             else None
+         )
+         hs = (
+             tf.convert_to_tensor(output.hidden_states)
+             if self.config.output_hidden_states
+             else None
+         )
+         attns = (
+             tf.convert_to_tensor(output.attentions)
+             if self.config.output_attentions
+             else None
+         )
+         cross_attns = (
+             tf.convert_to_tensor(output.cross_attentions)
+             if self.config.output_attentions
+             and self.config.add_cross_attention
+             and output.cross_attentions is not None
+             else None
+         )
+
+         return TFCausalLMOutputWithCrossAttentions(
+             logits=output.logits,
+             past_key_values=pkv,
+             hidden_states=hs,
+             attentions=attns,
+             cross_attentions=cross_attns,
+         )
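To close the loop on the `past`/`use_cache` and `labels` semantics documented in `call` above, here is a minimal usage sketch. It is not part of the uploaded file: the import path, the small `GPT2Config` values, and the assumption that the custom `remaining_frames_ids` input can be omitted are all unverified guesses and may need adjusting to this repository's actual training setup.

```python
# Illustrative sketch only -- not part of gpt2_embedding.py. Import path and config
# values are assumptions; `remaining_frames_ids` is omitted on the assumption that
# the custom TFGPT2MainLayer treats it as optional.
import tensorflow as tf
from transformers import GPT2Config

from ganime.model.vqgan_clean.experimental.gpt2_embedding import TFGPT2LMHeadModel

config = GPT2Config(vocab_size=512, n_positions=64, n_embd=64, n_layer=2, n_head=2)
model = TFGPT2LMHeadModel(config)

tokens = tf.random.uniform((2, 16), maxval=config.vocab_size, dtype=tf.int32)

# Training-style call: `labels` are shifted left inside `call`, so the logits at
# position t are scored against the token at position t + 1.
out = model(input_ids=tokens, labels=tokens, training=True)
print(out.loss.shape, out.logits.shape)  # loss tensor and (2, 16, 512) logits

# Incremental decoding: keep the cache and feed only the newest token back in.
out = model(input_ids=tokens, use_cache=True)
past = out.past_key_values
next_token = tf.argmax(out.logits[:, -1, :], axis=-1, output_type=tf.int32)[:, None]
out = model(input_ids=next_token, past=past, use_cache=True)
```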